xref: /petsc/src/mat/impls/h2opus/cuda/math2opus.cu (revision 2d30e087755efd99e28fdfe792ffbeb2ee1ea928)
1 #include <h2opusconf.h>
2 /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
3 #if !defined(__CUDACC__) || defined(H2OPUS_USE_GPU)
4 #include <h2opus.h>
5 #if defined(H2OPUS_USE_MPI)
6 #include <h2opus/distributed/distributed_h2opus_handle.h>
7 #include <h2opus/distributed/distributed_geometric_construction.h>
8 #include <h2opus/distributed/distributed_hgemv.h>
9 #include <h2opus/distributed/distributed_horthog.h>
10 #include <h2opus/distributed/distributed_hcompress.h>
11 #endif
12 #include <h2opus/util/boxentrygen.h>
13 #include <petsc/private/matimpl.h>
14 #include <petsc/private/vecimpl.h>
15 #include <petsc/private/deviceimpl.h>
16 #include <petscsf.h>
17 
18 /* math2opusutils */
19 PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF, PetscInt, PetscInt, PetscInt, PetscSF *);
20 PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat, PetscSF, PetscSF *);
21 PETSC_INTERN PetscErrorCode VecSign(Vec, Vec);
22 PETSC_INTERN PetscErrorCode VecSetDelta(Vec, PetscInt);
23 PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat, NormType, PetscInt, PetscReal *);
24 
25 #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())
26 
27 /* Use GPU only if H2OPUS is configured for GPU */
28 #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
29 #define PETSC_H2OPUS_USE_GPU
30 #endif
31 #if defined(PETSC_H2OPUS_USE_GPU)
32 #define MatH2OpusUpdateIfNeeded(A, B) MatBindToCPU(A, (PetscBool)((A)->boundtocpu || (B)))
33 #else
34 #define MatH2OpusUpdateIfNeeded(A, B) 0
35 #endif
36 
37 // TODO H2OPUS:
38 // DistributedHMatrix
39 //   unsymmetric ?
40 //   transpose for distributed_hgemv?
41 //   clearData()
42 // Unify interface for sequential and parallel?
43 // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
44 //
45 template <class T>
46 class PetscPointCloud : public H2OpusDataSet<T> {
47 private:
48   int            dimension;
49   size_t         num_points;
50   std::vector<T> pts;
51 
52 public:
53   PetscPointCloud(int dim, size_t num_pts, const T coords[]) {
54     dim              = dim > 0 ? dim : 1;
55     this->dimension  = dim;
56     this->num_points = num_pts;
57 
58     pts.resize(num_pts * dim);
59     if (coords) {
60       for (size_t n = 0; n < num_pts; n++)
61         for (int i = 0; i < dim; i++) pts[n * dim + i] = coords[n * dim + i];
62     } else {
63       PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0;
64       for (size_t n = 0; n < num_pts; n++) {
65         pts[n * dim] = n * h;
66         for (int i = 1; i < dim; i++) pts[n * dim + i] = 0.0;
67       }
68     }
69   }
70 
71   PetscPointCloud(const PetscPointCloud<T> &other) {
72     size_t N         = other.dimension * other.num_points;
73     this->dimension  = other.dimension;
74     this->num_points = other.num_points;
75     this->pts.resize(N);
76     for (size_t i = 0; i < N; i++) this->pts[i] = other.pts[i];
77   }
78 
79   int getDimension() const { return dimension; }
80 
81   size_t getDataSetSize() const { return num_points; }
82 
83   T getDataPoint(size_t idx, int dim) const {
84     assert(dim < dimension && idx < num_points);
85     return pts[idx * dimension + dim];
86   }
87 
88   void Print(std::ostream &out = std::cout) {
89     out << "Dimension: " << dimension << std::endl;
90     out << "NumPoints: " << num_points << std::endl;
91     for (size_t n = 0; n < num_points; n++) {
92       for (int d = 0; d < dimension; d++) out << pts[n * dimension + d] << " ";
93       out << std::endl;
94     }
95   }
96 };
97 
98 template <class T>
99 class PetscFunctionGenerator {
100 private:
101   MatH2OpusKernel k;
102   int             dim;
103   void           *ctx;
104 
105 public:
106   PetscFunctionGenerator(MatH2OpusKernel k, int dim, void *ctx) {
107     this->k   = k;
108     this->dim = dim;
109     this->ctx = ctx;
110   }
111   PetscFunctionGenerator(PetscFunctionGenerator &other) {
112     this->k   = other.k;
113     this->dim = other.dim;
114     this->ctx = other.ctx;
115   }
116   T operator()(PetscReal *pt1, PetscReal *pt2) { return (T)((*this->k)(this->dim, pt1, pt2, this->ctx)); }
117 };
118 
119 #include <../src/mat/impls/h2opus/math2opussampler.hpp>
120 
121 /* just to not clutter the code */
122 #if !defined(H2OPUS_USE_GPU)
123 typedef HMatrix HMatrix_GPU;
124 #if defined(H2OPUS_USE_MPI)
125 typedef DistributedHMatrix DistributedHMatrix_GPU;
126 #endif
127 #endif
128 
/* Private data of a MATH2OPUS matrix. CPU and GPU copies of the hierarchical
   matrix are kept in separate members; sequential and distributed variants are
   different H2OPUS classes, selected at compile time via H2OPUS_USE_MPI. */
typedef struct {
  /* H2OPUS library handle (distributed flavor when built with MPI support) */
#if defined(H2OPUS_USE_MPI)
  distributedH2OpusHandle_t handle;
#else
  h2opusHandle_t handle;
#endif
  /* Sequential and parallel matrices are two different classes at the moment */
  HMatrix *hmatrix;
#if defined(H2OPUS_USE_MPI)
  DistributedHMatrix *dist_hmatrix;
#else
  HMatrix       *dist_hmatrix;     /* just to not clutter the code */
#endif
  /* May use permutations */
  PetscSF                           sf;                       /* maps PETSc DOF ordering to the H2OPUS index map */
  PetscLayout                       h2opus_rmap, h2opus_cmap; /* alternate layouts swapped in by MatH2OpusSetNativeMult */
  IS                                h2opus_indexmap;          /* H2OPUS cluster-tree index map as an IS */
  thrust::host_vector<PetscScalar> *xx, *yy;                  /* host work buffers for permuted in/out vectors */
  PetscInt                          xxs, yys;                 /* number of columns currently provisioned in xx/yy */
  PetscBool                         multsetup;                /* set once MatSetUpMultiply_H2OPUS has run */

  /* GPU */
  HMatrix_GPU *hmatrix_gpu;
#if defined(H2OPUS_USE_MPI)
  DistributedHMatrix_GPU *dist_hmatrix_gpu;
#else
  HMatrix_GPU   *dist_hmatrix_gpu; /* just to not clutter the code */
#endif
#if defined(PETSC_H2OPUS_USE_GPU)
  thrust::device_vector<PetscScalar> *xx_gpu, *yy_gpu; /* device work buffers, mirror xx/yy */
  PetscInt                            xxs_gpu, yys_gpu;
#endif

  /* construction from matvecs */
  PetscMatrixSampler *sampler;    /* wraps the matrix being sampled */
  PetscBool           nativemult; /* if true, MatMult operates in H2OPUS ordering (no SF permutation) */

  /* Admissibility */
  PetscReal eta;
  PetscInt  leafsize;

  /* for dof reordering */
  PetscPointCloud<PetscReal> *ptcloud;

  /* kernel for generating matrix entries */
  PetscFunctionGenerator<PetscScalar> *kernel;

  /* basis orthogonalized? */
  PetscBool orthogonal;

  /* customization */
  PetscInt  basisord;           /* basis order for off-diagonal sampling */
  PetscInt  max_rank;           /* maximum rank for construction from matvecs */
  PetscInt  bs;                 /* number of concurrent samples */
  PetscReal rtol;               /* relative tolerance for sampling construction */
  PetscInt  norm_max_samples;   /* samples used by MatNorm_H2OPUS */
  PetscBool check_construction; /* verify construction error at MatAssemblyEnd */
  PetscBool hara_verbose;
  PetscBool resize;             /* shrink storage after compression */

  /* keeps track of MatScale values */
  PetscScalar s;
} Mat_H2OPUS;
192 
/* Release all CPU/GPU data attached to a MATH2OPUS matrix, remove composed
   MatProduct callbacks, and clear the type name. */
static PetscErrorCode MatDestroy_H2OPUS(Mat A) {
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

  PetscFunctionBegin;
#if defined(H2OPUS_USE_MPI)
  h2opusDestroyDistributedHandle(a->handle);
#else
  h2opusDestroyHandle(a->handle);
#endif
  /* C++ objects created with new; delete on NULL is a no-op */
  delete a->dist_hmatrix;
  delete a->hmatrix;
  PetscCall(PetscSFDestroy(&a->sf));
  PetscCall(PetscLayoutDestroy(&a->h2opus_rmap));
  PetscCall(PetscLayoutDestroy(&a->h2opus_cmap));
  PetscCall(ISDestroy(&a->h2opus_indexmap));
  delete a->xx;
  delete a->yy;
  delete a->hmatrix_gpu;
  delete a->dist_hmatrix_gpu;
#if defined(PETSC_H2OPUS_USE_GPU)
  delete a->xx_gpu;
  delete a->yy_gpu;
#endif
  delete a->sampler;
  delete a->ptcloud;
  delete a->kernel;
  /* detach the MatProduct hooks composed on this object at creation time */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", NULL));
  PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
  PetscCall(PetscFree(A->data));
  PetscFunctionReturn(0);
}
227 
228 PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm) {
229   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
230   PetscBool   ish2opus;
231 
232   PetscFunctionBegin;
233   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
234   PetscValidLogicalCollectiveBool(A, nm, 2);
235   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
236   if (ish2opus) {
237     if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
238       if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
239         PetscLayout t;
240         t              = A->rmap;
241         A->rmap        = a->h2opus_rmap;
242         a->h2opus_rmap = t;
243         t              = A->cmap;
244         A->cmap        = a->h2opus_cmap;
245         a->h2opus_cmap = t;
246       }
247     }
248     a->nativemult = nm;
249   }
250   PetscFunctionReturn(0);
251 }
252 
253 PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm) {
254   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
255   PetscBool   ish2opus;
256 
257   PetscFunctionBegin;
258   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
259   PetscValidPointer(nm, 2);
260   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
261   PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
262   *nm = a->nativemult;
263   PetscFunctionReturn(0);
264 }
265 
266 PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal *n) {
267   PetscBool   ish2opus;
268   PetscInt    nmax = PETSC_DECIDE;
269   Mat_H2OPUS *a    = NULL;
270   PetscBool   mult = PETSC_FALSE;
271 
272   PetscFunctionBegin;
273   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
274   if (ish2opus) { /* set userdefine number of samples and fastpath for mult (norms are order independent) */
275     a = (Mat_H2OPUS *)A->data;
276 
277     nmax = a->norm_max_samples;
278     mult = a->nativemult;
279     PetscCall(MatH2OpusSetNativeMult(A, PETSC_TRUE));
280   } else {
281     PetscCall(PetscOptionsGetInt(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_approximate_norm_samples", &nmax, NULL));
282   }
283   PetscCall(MatApproximateNorm_Private(A, normtype, nmax, n));
284   if (a) PetscCall(MatH2OpusSetNativeMult(A, mult));
285   PetscFunctionReturn(0);
286 }
287 
/* Grow the host (or device, when not bound to CPU) work buffers so they can
   hold xN input columns and yN output columns of length n, where n is the
   number of leaves of the SF permutation. Buffers only ever grow. */
static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN) {
  Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
  PetscInt    n;
  PetscBool   boundtocpu = PETSC_TRUE;

  PetscFunctionBegin;
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
  if (boundtocpu) {
    if (h2opus->xxs < xN) {
      h2opus->xx->resize(n * xN);
      h2opus->xxs = xN;
    }
    if (h2opus->yys < yN) {
      h2opus->yy->resize(n * yN);
      h2opus->yys = yN;
    }
  }
#if defined(PETSC_H2OPUS_USE_GPU)
  if (!boundtocpu) {
    if (h2opus->xxs_gpu < xN) {
      h2opus->xx_gpu->resize(n * xN);
      h2opus->xxs_gpu = xN;
    }
    if (h2opus->yys_gpu < yN) {
      h2opus->yy_gpu->resize(n * yN);
      h2opus->yys_gpu = yN;
    }
  }
#endif
  PetscFunctionReturn(0);
}
322 
/* Compute the dense product C = s * op(A) * B (op = transpose when transA),
   dispatching to H2OPUS hgemv/distributed_hgemv on CPU or GPU depending on
   where A is bound. When the matrix uses a permutation (usesf), columns of B
   are gathered into internal buffers via an SF broadcast and results are
   scattered back with an SF reduce. Flops are logged from HLibProfile. */
static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C) {
  Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
#if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = h2opus->handle->handle;
#else
  h2opusHandle_t handle = h2opus->handle;
#endif
  PetscBool    boundtocpu = PETSC_TRUE;
  PetscScalar *xx, *yy, *uxx, *uyy;
  PetscInt     blda, clda;
  PetscMPIInt  size;
  PetscSF      bsf, csf;
  /* permutation needed only when an SF exists and native ordering is off */
  PetscBool    usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

  PetscFunctionBegin;
  HLibProfile::clear();
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  PetscCall(MatDenseGetLDA(B, &blda));
  PetscCall(MatDenseGetLDA(C, &clda));
  if (usesf) {
    PetscInt n;

    /* SFs handling multiple columns at once, derived from h2opus->sf */
    PetscCall(MatDenseGetH2OpusVectorSF(B, h2opus->sf, &bsf));
    PetscCall(MatDenseGetH2OpusVectorSF(C, h2opus->sf, &csf));

    PetscCall(MatH2OpusResizeBuffers_Private(A, B->cmap->N, C->cmap->N));
    PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
    /* the internal buffers are tightly packed with leading dimension n */
    blda = n;
    clda = n;
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (boundtocpu) {
    PetscCall(MatDenseGetArrayRead(B, (const PetscScalar **)&xx));
    PetscCall(MatDenseGetArrayWrite(C, &yy));
    if (usesf) {
      /* gather B into the permuted work buffer */
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
      PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
    } else {
      uxx = xx;
      uyy = yy;
    }
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
      PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
    }
    PetscCall(MatDenseRestoreArrayRead(B, (const PetscScalar **)&xx));
    if (usesf) {
      /* scatter the permuted result back into C */
      PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
    }
    PetscCall(MatDenseRestoreArrayWrite(C, &yy));
#if defined(PETSC_H2OPUS_USE_GPU)
  } else {
    PetscBool ciscuda, biscuda;

    /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
    PetscCall(PetscObjectTypeCompareAny((PetscObject)B, &biscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
    if (!biscuda) PetscCall(MatConvert(B, MATDENSECUDA, MAT_INPLACE_MATRIX, &B));
    PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &ciscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
    if (!ciscuda) {
      C->assembled = PETSC_TRUE;
      PetscCall(MatConvert(C, MATDENSECUDA, MAT_INPLACE_MATRIX, &C));
    }
    PetscCall(MatDenseCUDAGetArrayRead(B, (const PetscScalar **)&xx));
    PetscCall(MatDenseCUDAGetArrayWrite(C, &yy));
    if (usesf) {
      /* gather B into the device work buffer */
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
      PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
    } else {
      uxx = xx;
      uyy = yy;
    }
    PetscCall(PetscLogGpuTimeBegin());
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
      PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatDenseCUDARestoreArrayRead(B, (const PetscScalar **)&xx));
    if (usesf) {
      PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
    }
    PetscCall(MatDenseCUDARestoreArrayWrite(C, &yy));
    /* undo the on-the-fly conversions */
    if (!biscuda) PetscCall(MatConvert(B, MATDENSE, MAT_INPLACE_MATRIX, &B));
    if (!ciscuda) PetscCall(MatConvert(C, MATDENSE, MAT_INPLACE_MATRIX, &C));
#endif
  }
  { /* log flops */
    double gops, time, perf, dev;
    HLibProfile::getHgemvPerf(gops, time, perf, dev);
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9 * gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9 * gops));
    }
#else
    PetscCall(PetscLogFlops(1e9 * gops));
#endif
  }
  PetscFunctionReturn(0);
}
444 
445 static PetscErrorCode MatProductNumeric_H2OPUS(Mat C) {
446   Mat_Product *product = C->product;
447 
448   PetscFunctionBegin;
449   MatCheckProduct(C, 1);
450   switch (product->type) {
451   case MATPRODUCT_AB: PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_FALSE, product->B, C)); break;
452   case MATPRODUCT_AtB: PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_TRUE, product->B, C)); break;
453   default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
454   }
455   PetscFunctionReturn(0);
456 }
457 
458 static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C) {
459   Mat_Product *product = C->product;
460   PetscBool    cisdense;
461   Mat          A, B;
462 
463   PetscFunctionBegin;
464   MatCheckProduct(C, 1);
465   A = product->A;
466   B = product->B;
467   switch (product->type) {
468   case MATPRODUCT_AB:
469     PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
470     PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
471     PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
472     if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
473     PetscCall(MatSetUp(C));
474     break;
475   case MATPRODUCT_AtB:
476     PetscCall(MatSetSizes(C, A->cmap->n, B->cmap->n, A->cmap->N, B->cmap->N));
477     PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
478     PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
479     if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
480     PetscCall(MatSetUp(C));
481     break;
482   default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
483   }
484   C->ops->productsymbolic = NULL;
485   C->ops->productnumeric  = MatProductNumeric_H2OPUS;
486   PetscFunctionReturn(0);
487 }
488 
489 static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C) {
490   PetscFunctionBegin;
491   MatCheckProduct(C, 1);
492   if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
493   PetscFunctionReturn(0);
494 }
495 
/* Compute y = s * op(A) * x + sy * y (op = transpose when trans), dispatching
   to H2OPUS hgemv/distributed_hgemv on CPU or GPU depending on where A is
   bound. When a permutation is active (usesf), x (and y, if sy != 0) are
   gathered into internal buffers via an SF broadcast and the result is
   scattered back with an SF reduce. Flops are logged from HLibProfile. */
static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans) {
  Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
#if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = h2opus->handle->handle;
#else
  h2opusHandle_t handle = h2opus->handle;
#endif
  PetscBool    boundtocpu = PETSC_TRUE;
  PetscInt     n;
  PetscScalar *xx, *yy, *uxx, *uyy;
  PetscMPIInt  size;
  /* permutation needed only when an SF exists and native ordering is off */
  PetscBool    usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

  PetscFunctionBegin;
  HLibProfile::clear();
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  if (usesf) {
    PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
  } else n = A->rmap->n;
  if (boundtocpu) {
    PetscCall(VecGetArrayRead(x, (const PetscScalar **)&xx));
    /* write-only access suffices when the previous y content is not used */
    if (sy == 0.0) {
      PetscCall(VecGetArrayWrite(y, &yy));
    } else {
      PetscCall(VecGetArray(y, &yy));
    }
    if (usesf) {
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy);

      PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      if (sy != 0.0) {
        /* y contributes to the result, so its values must be permuted too */
        PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
      }
    } else {
      uxx = xx;
      uyy = yy;
    }
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
      PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
    }
    PetscCall(VecRestoreArrayRead(x, (const PetscScalar **)&xx));
    if (usesf) {
      /* scatter the permuted result back into y */
      PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
    }
    if (sy == 0.0) {
      PetscCall(VecRestoreArrayWrite(y, &yy));
    } else {
      PetscCall(VecRestoreArray(y, &yy));
    }
#if defined(PETSC_H2OPUS_USE_GPU)
  } else {
    PetscCall(VecCUDAGetArrayRead(x, (const PetscScalar **)&xx));
    if (sy == 0.0) {
      PetscCall(VecCUDAGetArrayWrite(y, &yy));
    } else {
      PetscCall(VecCUDAGetArray(y, &yy));
    }
    if (usesf) {
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);

      PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      if (sy != 0.0) {
        PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
      }
    } else {
      uxx = xx;
      uyy = yy;
    }
    PetscCall(PetscLogGpuTimeBegin());
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
      PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArrayRead(x, (const PetscScalar **)&xx));
    if (usesf) {
      PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
    }
    if (sy == 0.0) {
      PetscCall(VecCUDARestoreArrayWrite(y, &yy));
    } else {
      PetscCall(VecCUDARestoreArray(y, &yy));
    }
#endif
  }
  { /* log flops */
    double gops, time, perf, dev;
    HLibProfile::getHgemvPerf(gops, time, perf, dev);
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9 * gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9 * gops));
    }
#else
    PetscCall(PetscLogFlops(1e9 * gops));
#endif
  }
  PetscFunctionReturn(0);
}
620 
621 static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y) {
622   PetscBool xiscuda, yiscuda;
623 
624   PetscFunctionBegin;
625   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
626   PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
627   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
628   PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_TRUE));
629   PetscFunctionReturn(0);
630 }
631 
632 static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y) {
633   PetscBool xiscuda, yiscuda;
634 
635   PetscFunctionBegin;
636   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
637   PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
638   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
639   PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_FALSE));
640   PetscFunctionReturn(0);
641 }
642 
643 static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z) {
644   PetscBool xiscuda, ziscuda;
645 
646   PetscFunctionBegin;
647   PetscCall(VecCopy(y, z));
648   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
649   PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
650   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
651   PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_TRUE));
652   PetscFunctionReturn(0);
653 }
654 
655 static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z) {
656   PetscBool xiscuda, ziscuda;
657 
658   PetscFunctionBegin;
659   PetscCall(VecCopy(y, z));
660   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
661   PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
662   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
663   PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_FALSE));
664   PetscFunctionReturn(0);
665 }
666 
667 static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s) {
668   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
669 
670   PetscFunctionBegin;
671   a->s *= s;
672   PetscFunctionReturn(0);
673 }
674 
675 static PetscErrorCode MatSetFromOptions_H2OPUS(Mat A, PetscOptionItems *PetscOptionsObject) {
676   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
677 
678   PetscFunctionBegin;
679   PetscOptionsHeadBegin(PetscOptionsObject, "H2OPUS options");
680   PetscCall(PetscOptionsInt("-mat_h2opus_leafsize", "Leaf size of cluster tree", NULL, a->leafsize, &a->leafsize, NULL));
681   PetscCall(PetscOptionsReal("-mat_h2opus_eta", "Admissibility condition tolerance", NULL, a->eta, &a->eta, NULL));
682   PetscCall(PetscOptionsInt("-mat_h2opus_order", "Basis order for off-diagonal sampling when constructed from kernel", NULL, a->basisord, &a->basisord, NULL));
683   PetscCall(PetscOptionsInt("-mat_h2opus_maxrank", "Maximum rank when constructed from matvecs", NULL, a->max_rank, &a->max_rank, NULL));
684   PetscCall(PetscOptionsInt("-mat_h2opus_samples", "Maximum number of samples to be taken concurrently when constructing from matvecs", NULL, a->bs, &a->bs, NULL));
685   PetscCall(PetscOptionsInt("-mat_h2opus_normsamples", "Maximum bumber of samples to be when estimating norms", NULL, a->norm_max_samples, &a->norm_max_samples, NULL));
686   PetscCall(PetscOptionsReal("-mat_h2opus_rtol", "Relative tolerance for construction from sampling", NULL, a->rtol, &a->rtol, NULL));
687   PetscCall(PetscOptionsBool("-mat_h2opus_check", "Check error when constructing from sampling during MatAssemblyEnd()", NULL, a->check_construction, &a->check_construction, NULL));
688   PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose", "Verbose output from hara construction", NULL, a->hara_verbose, &a->hara_verbose, NULL));
689   PetscCall(PetscOptionsBool("-mat_h2opus_resize", "Resize after compression", NULL, a->resize, &a->resize, NULL));
690   PetscOptionsHeadEnd();
691   PetscFunctionReturn(0);
692 }
693 
694 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat, PetscInt, const PetscReal[], PetscBool, MatH2OpusKernel, void *);
695 
696 static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A) {
697   Mat_H2OPUS        *a = (Mat_H2OPUS *)A->data;
698   Vec                c;
699   PetscInt           spacedim;
700   const PetscScalar *coords;
701 
702   PetscFunctionBegin;
703   if (a->ptcloud) PetscFunctionReturn(0);
704   PetscCall(PetscObjectQuery((PetscObject)A, "__math2opus_coords", (PetscObject *)&c));
705   if (!c && a->sampler) {
706     Mat S = a->sampler->GetSamplingMat();
707 
708     PetscCall(PetscObjectQuery((PetscObject)S, "__math2opus_coords", (PetscObject *)&c));
709   }
710   if (!c) {
711     PetscCall(MatH2OpusSetCoords_H2OPUS(A, -1, NULL, PETSC_FALSE, NULL, NULL));
712   } else {
713     PetscCall(VecGetArrayRead(c, &coords));
714     PetscCall(VecGetBlockSize(c, &spacedim));
715     PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, PETSC_FALSE, NULL, NULL));
716     PetscCall(VecRestoreArrayRead(c, &coords));
717   }
718   PetscFunctionReturn(0);
719 }
720 
/* One-time setup for MatMult: extract the H2OPUS cluster-tree index map,
   create the SF that permutes PETSc-ordered vectors into H2OPUS ordering
   (skipped when the map is the identity), and allocate host/device work
   buffers sized for one column. Idempotent via a->multsetup. */
static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A) {
  MPI_Comm      comm;
  PetscMPIInt   size;
  Mat_H2OPUS   *a = (Mat_H2OPUS *)A->data;
  PetscInt      n = 0, *idx = NULL;
  int          *iidx = NULL;
  PetscCopyMode own;
  PetscBool     rid;

  PetscFunctionBegin;
  if (a->multsetup) PetscFunctionReturn(0);
  if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */
    PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
#if defined(PETSC_H2OPUS_USE_GPU)
    a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
    a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
    a->xxs_gpu = 1;
    a->yys_gpu = 1;
#endif
    a->xx  = new thrust::host_vector<PetscScalar>(n);
    a->yy  = new thrust::host_vector<PetscScalar>(n);
    a->xxs = 1;
    a->yys = 1;
  } else {
    IS is;
    PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
    PetscCallMPI(MPI_Comm_size(comm, &size));
    if (!a->h2opus_indexmap) {
      /* pull the index map out of the (distributed or sequential) basis tree */
      if (size > 1) {
        PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
#if defined(H2OPUS_USE_MPI)
        iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
        n    = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
#endif
      } else {
        iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
        n    = a->hmatrix->u_basis_tree.index_map.size();
      }

      /* H2OPUS stores int indices; widen to PetscInt when they differ */
      if (PetscDefined(USE_64BIT_INDICES)) {
        PetscInt i;

        own = PETSC_OWN_POINTER;
        PetscCall(PetscMalloc1(n, &idx));
        for (i = 0; i < n; i++) idx[i] = iidx[i];
      } else {
        own = PETSC_COPY_VALUES;
        idx = (PetscInt *)iidx;
      }
      PetscCall(ISCreateGeneral(comm, n, idx, own, &is));
      PetscCall(ISSetPermutation(is));
      PetscCall(ISViewFromOptions(is, (PetscObject)A, "-mat_h2opus_indexmap_view"));
      a->h2opus_indexmap = is;
    }
    PetscCall(ISGetLocalSize(a->h2opus_indexmap, &n));
    PetscCall(ISGetIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
    /* rid: the map is the identity on every process -> no permutation needed */
    rid = (PetscBool)(n == A->rmap->n);
    PetscCall(MPIU_Allreduce(MPI_IN_PLACE, &rid, 1, MPIU_BOOL, MPI_LAND, comm));
    if (rid) PetscCall(ISIdentity(a->h2opus_indexmap, &rid));
    if (!rid) {
      if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
        PetscCall(PetscLayoutCreate(comm, &a->h2opus_rmap));
        PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap, n));
        PetscCall(PetscLayoutSetUp(a->h2opus_rmap));
        PetscCall(PetscLayoutReference(a->h2opus_rmap, &a->h2opus_cmap));
      }
      PetscCall(PetscSFCreate(comm, &a->sf));
      PetscCall(PetscSFSetGraphLayout(a->sf, A->rmap, n, NULL, PETSC_OWN_POINTER, idx));
      PetscCall(PetscSFSetUp(a->sf));
      PetscCall(PetscSFViewFromOptions(a->sf, (PetscObject)A, "-mat_h2opus_sf_view"));
#if defined(PETSC_H2OPUS_USE_GPU)
      a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
      a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
      a->xxs_gpu = 1;
      a->yys_gpu = 1;
#endif
      a->xx  = new thrust::host_vector<PetscScalar>(n);
      a->yy  = new thrust::host_vector<PetscScalar>(n);
      a->xxs = 1;
      a->yys = 1;
    }
    PetscCall(ISRestoreIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
  }
  a->multsetup = PETSC_TRUE;
  PetscFunctionReturn(0);
}
807 
/* Assembles the hierarchical matrix: (re)builds the (distributed) H-matrix
   structure from the point cloud, fills the entries either analytically from
   the user kernel or by randomized sampling of a user matrix (hara), mirrors
   the result to the GPU when not bound to the CPU, and optionally checks the
   construction error (-mat_h2opus_check[_explicit]). */
static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype) {
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
#if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = a->handle->handle; /* the distributed handle wraps the sequential one */
#else
  h2opusHandle_t handle = a->handle;
#endif
  PetscBool   kernel       = PETSC_FALSE; /* set when entries are generated from a->kernel */
  PetscBool   boundtocpu   = PETSC_TRUE;
  PetscBool   samplingdone = PETSC_FALSE;
  MPI_Comm    comm;
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  /* only square matrices with congruent layouts are supported */
  PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
  PetscCheck(A->rmap->N == A->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");

  /* XXX */
  a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));

  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* TODO REUSABILITY of geometric construction */
  /* any previously built matrices are discarded and rebuilt from scratch */
  delete a->hmatrix;
  delete a->dist_hmatrix;
#if defined(PETSC_H2OPUS_USE_GPU)
  delete a->hmatrix_gpu;
  delete a->dist_hmatrix_gpu;
#endif
  a->orthogonal = PETSC_FALSE;

  /* TODO: other? */
  H2OpusBoxCenterAdmissibility adm(a->eta);

  PetscCall(PetscLogEventBegin(MAT_H2Opus_Build, A, 0, 0, 0));
  /* allocate the CPU matrix: distributed flavor in parallel, sequential otherwise */
  if (size > 1) {
#if defined(H2OPUS_USE_MPI)
    a->dist_hmatrix = new DistributedHMatrix(A->rmap->n /* ,A->symmetric */);
#else
    a->dist_hmatrix = NULL;
#endif
  } else a->hmatrix = new HMatrix(A->rmap->n, A->symmetric == PETSC_BOOL3_TRUE);
  PetscCall(MatH2OpusInferCoordinates_Private(A));
  PetscCheck(a->ptcloud, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing pointcloud");
  if (a->kernel) {
    /* entries generated analytically from the kernel */
    BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
    if (size > 1) {
      PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
#if defined(H2OPUS_USE_MPI)
      buildDistributedHMatrix(*a->dist_hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord, a->handle);
#endif
    } else {
      buildHMatrix(*a->hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord);
    }
    kernel = PETSC_TRUE;
  } else {
    /* only the structure is built here; entries are filled by sampling below */
    PetscCheck(size <= 1, comm, PETSC_ERR_SUP, "Construction from sampling not supported in parallel");
    buildHMatrixStructure(*a->hmatrix, a->ptcloud, a->leafsize, adm);
  }
  PetscCall(MatSetUpMultiply_H2OPUS(A));

#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
  /* mirror the freshly built CPU matrix on the GPU when not bound to the CPU */
  if (!boundtocpu) {
    if (size > 1) {
      PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
#if defined(H2OPUS_USE_MPI)
      a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
#endif
    } else {
      a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
    }
  }
#endif
  if (size == 1) {
    if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
      /* fill the entries by adaptive randomized sampling (hara) of the user-provided matrix */
      PetscReal Anorm;
      bool      verbose;

      PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_hara_verbose", &a->hara_verbose, NULL));
      verbose = a->hara_verbose;
      /* approximate 2-norm of the sampling matrix; scales the relative tolerance into an absolute one */
      PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(), NORM_2, a->norm_max_samples, &Anorm));
      if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF, "Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n", a->max_rank, a->rtol * Anorm, a->rtol, Anorm, boundtocpu ? "CPU" : "GPU", a->bs));
      if (a->sf && !a->nativemult) a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(), a->hmatrix->u_basis_tree.index_map.data());
      a->sampler->SetStream(handle->getMainStream());
      if (boundtocpu) {
        a->sampler->SetGPUSampling(false);
        hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
#if defined(PETSC_H2OPUS_USE_GPU)
      } else {
        a->sampler->SetGPUSampling(true);
        hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
#endif
      }
      samplingdone = PETSC_TRUE;
    }
  }
#if defined(PETSC_H2OPUS_USE_GPU)
  /* when running on GPU, drop the CPU copies and mark the GPU side as the valid one */
  if (!boundtocpu) {
    delete a->hmatrix;
    delete a->dist_hmatrix;
    a->hmatrix      = NULL;
    a->dist_hmatrix = NULL;
  }
  A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
#endif
  PetscCall(PetscLogEventEnd(MAT_H2Opus_Build, A, 0, 0, 0));

  if (!a->s) a->s = 1.0; /* ensure the scaling factor is nonzero */
  A->assembled = PETSC_TRUE;

  if (samplingdone) {
    PetscBool check  = a->check_construction;
    PetscBool checke = PETSC_FALSE;

    PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check", &check, NULL));
    PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check_explicit", &checke, NULL));
    if (check) {
      Mat       E, Ae;
      PetscReal n1, ni, n2;
      PetscReal n1A, niA, n2A;
      void (*normfunc)(void);

      /* E = H2Mat - S, formed matrix-free through a MATSHELL */
      Ae = a->sampler->GetSamplingMat();
      PetscCall(MatConvert(A, MATSHELL, MAT_INITIAL_MATRIX, &E));
      PetscCall(MatShellSetOperation(E, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS));
      PetscCall(MatAXPY(E, -1.0, Ae, DIFFERENT_NONZERO_PATTERN));
      PetscCall(MatNorm(E, NORM_1, &n1));
      PetscCall(MatNorm(E, NORM_INFINITY, &ni));
      PetscCall(MatNorm(E, NORM_2, &n2));
      if (checke) {
        /* explicitly assemble the operators, chop tiny entries and dump them for inspection */
        Mat eA, eE, eAe;

        PetscCall(MatComputeOperator(A, MATAIJ, &eA));
        PetscCall(MatComputeOperator(E, MATAIJ, &eE));
        PetscCall(MatComputeOperator(Ae, MATAIJ, &eAe));
        PetscCall(MatChop(eA, PETSC_SMALL));
        PetscCall(MatChop(eE, PETSC_SMALL));
        PetscCall(MatChop(eAe, PETSC_SMALL));
        PetscCall(PetscObjectSetName((PetscObject)eA, "H2Mat"));
        PetscCall(MatView(eA, NULL));
        PetscCall(PetscObjectSetName((PetscObject)eAe, "S"));
        PetscCall(MatView(eAe, NULL));
        PetscCall(PetscObjectSetName((PetscObject)eE, "H2Mat - S"));
        PetscCall(MatView(eE, NULL));
        PetscCall(MatDestroy(&eA));
        PetscCall(MatDestroy(&eE));
        PetscCall(MatDestroy(&eAe));
      }

      /* norms of Ae (temporarily computed with the approximate norm routine) for relative errors */
      PetscCall(MatGetOperation(Ae, MATOP_NORM, &normfunc));
      PetscCall(MatSetOperation(Ae, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS));
      PetscCall(MatNorm(Ae, NORM_1, &n1A));
      PetscCall(MatNorm(Ae, NORM_INFINITY, &niA));
      PetscCall(MatNorm(Ae, NORM_2, &n2A));
      n1A = PetscMax(n1A, PETSC_SMALL);
      n2A = PetscMax(n2A, PETSC_SMALL);
      niA = PetscMax(niA, PETSC_SMALL);
      PetscCall(MatSetOperation(Ae, MATOP_NORM, normfunc)); /* restore the original norm operation */
      PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n", (double)n1, (double)ni, (double)n2, (double)(n1 / n1A), (double)(ni / niA), (double)(n2 / n2A)));
      PetscCall(MatDestroy(&E));
    }
    a->sampler->SetSamplingMat(NULL); /* done with the sampling matrix */
  }
  PetscFunctionReturn(0);
}
974 
975 static PetscErrorCode MatZeroEntries_H2OPUS(Mat A) {
976   PetscMPIInt size;
977   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
978 
979   PetscFunctionBegin;
980   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
981   PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet supported");
982   a->hmatrix->clearData();
983 #if defined(PETSC_H2OPUS_USE_GPU)
984   if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
985 #endif
986   PetscFunctionReturn(0);
987 }
988 
/* Duplicates a MATH2OPUS matrix. With MAT_COPY_VALUES the hierarchical data
   (CPU and, when present, GPU copies) is deep-copied; otherwise only the
   structure is reproduced and the numerical data cleared. The SF and index map
   are shared by reference (see the note in MatSetUpMultiply_H2OPUS). */
static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA) {
  Mat         A;
  Mat_H2OPUS *a, *b = (Mat_H2OPUS *)B->data;
#if defined(PETSC_H2OPUS_USE_GPU)
  PetscBool iscpu = PETSC_FALSE;
#else
  PetscBool iscpu = PETSC_TRUE;
#endif
  MPI_Comm comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
  PetscCall(MatCreate(comm, &A));
  PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
  PetscCall(MatSetType(A, MATH2OPUS));
  PetscCall(MatPropagateSymmetryOptions(B, A));
  a = (Mat_H2OPUS *)A->data;

  /* copy construction/sampling parameters */
  a->eta              = b->eta;
  a->leafsize         = b->leafsize;
  a->basisord         = b->basisord;
  a->max_rank         = b->max_rank;
  a->bs               = b->bs;
  a->rtol             = b->rtol;
  a->norm_max_samples = b->norm_max_samples;
  if (op == MAT_COPY_VALUES) a->s = b->s;

  /* the point cloud is always deep-copied; the kernel only when copying values */
  a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
  if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);

#if defined(H2OPUS_USE_MPI)
  if (b->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix);
#if defined(PETSC_H2OPUS_USE_GPU)
  if (b->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu);
#endif
#endif
  if (b->hmatrix) {
    a->hmatrix = new HMatrix(*b->hmatrix);
    if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
  }
#if defined(PETSC_H2OPUS_USE_GPU)
  if (b->hmatrix_gpu) {
    a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
    if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
  }
#endif
  /* SF and index map are reference-counted, not copied */
  if (b->sf) {
    PetscCall(PetscObjectReference((PetscObject)b->sf));
    a->sf = b->sf;
  }
  if (b->h2opus_indexmap) {
    PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap));
    a->h2opus_indexmap = b->h2opus_indexmap;
  }

  PetscCall(MatSetUp(A));
  PetscCall(MatSetUpMultiply_H2OPUS(A));
  if (op == MAT_COPY_VALUES) {
    A->assembled  = PETSC_TRUE;
    a->orthogonal = b->orthogonal;
#if defined(PETSC_H2OPUS_USE_GPU)
    A->offloadmask = B->offloadmask;
#endif
  }
#if defined(PETSC_H2OPUS_USE_GPU)
  iscpu = B->boundtocpu; /* preserve the CPU binding of the source matrix */
#endif
  PetscCall(MatBindToCPU(A, iscpu));

  *nA = A;
  PetscFunctionReturn(0);
}
1061 
/* Views a MATH2OPUS matrix: with PETSC_VIEWER_ASCII_MATLAB (sequential only)
   dumps the H-matrix via dumpHMatrix; otherwise prints construction parameters
   and memory usage. -mat_h2opus_vieweps additionally writes the matrix
   structure to an EPS file. */
static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view) {
  Mat_H2OPUS       *h2opus = (Mat_H2OPUS *)A->data;
  PetscBool         isascii, vieweps;
  PetscMPIInt       size;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)view, PETSCVIEWERASCII, &isascii));
  PetscCall(PetscViewerGetFormat(view, &format));
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (isascii) {
    if (format == PETSC_VIEWER_ASCII_MATLAB) {
      if (size == 1) {
        FILE *fp;
        PetscCall(PetscViewerASCIIGetPointer(view, &fp));
        dumpHMatrix(*h2opus->hmatrix, 6, fp);
      }
    } else {
      PetscCall(PetscViewerASCIIPrintf(view, "  H-Matrix constructed from %s\n", h2opus->kernel ? "Kernel" : "Mat"));
      PetscCall(PetscViewerASCIIPrintf(view, "  PointCloud dim %" PetscInt_FMT "\n", h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0));
      PetscCall(PetscViewerASCIIPrintf(view, "  Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n", h2opus->leafsize, (double)h2opus->eta));
      if (!h2opus->kernel) {
        PetscCall(PetscViewerASCIIPrintf(view, "  Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n", h2opus->max_rank, h2opus->bs, (double)h2opus->rtol));
      } else {
        PetscCall(PetscViewerASCIIPrintf(view, "  Offdiagonal blocks approximation order %" PetscInt_FMT "\n", h2opus->basisord));
      }
      PetscCall(PetscViewerASCIIPrintf(view, "  Number of samples for norms %" PetscInt_FMT "\n", h2opus->norm_max_samples));
      if (size == 1) {
        /* sequential: report CPU (and GPU, if compiled) memory usage of the local matrix */
        /* NOTE(review): the GPU members are guarded with PETSC_HAVE_CUDA here but with
           PETSC_H2OPUS_USE_GPU elsewhere in this file — confirm the two agree in all builds */
        double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
        double low_rank_cpu  = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
#if defined(PETSC_HAVE_CUDA)
        double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
        double low_rank_gpu  = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
#endif
        PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu));
#if defined(PETSC_HAVE_CUDA)
        PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu));
#endif
      } else {
        /* parallel: sum the local memory usage across all processes */
#if defined(PETSC_HAVE_CUDA)
        double      matrix_mem[4] = {0., 0., 0., 0.};
        PetscMPIInt rsize         = 4;
#else
        double      matrix_mem[2] = {0., 0.};
        PetscMPIInt rsize         = 2;
#endif
#if defined(H2OPUS_USE_MPI)
        matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
        matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
#if defined(PETSC_HAVE_CUDA)
        matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
        matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
#endif
#endif
        PetscCall(MPIU_Allreduce(MPI_IN_PLACE, matrix_mem, rsize, MPI_DOUBLE_PRECISION, MPI_SUM, PetscObjectComm((PetscObject)A)));
        PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]));
#if defined(PETSC_HAVE_CUDA)
        PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]));
#endif
      }
    }
  }
  vieweps = PETSC_FALSE;
  PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps", &vieweps, NULL));
  if (vieweps) {
    /* dump the hierarchical structure to <name>_structure.eps (name overridable via option) */
    char        filename[256];
    const char *name;

    PetscCall(PetscObjectGetName((PetscObject)A, &name));
    PetscCall(PetscSNPrintf(filename, sizeof(filename), "%s_structure.eps", name));
    PetscCall(PetscOptionsGetString(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps_filename", filename, sizeof(filename), NULL));
    outputEps(*h2opus->hmatrix, filename);
  }
  PetscFunctionReturn(0);
}
1137 
1138 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx) {
1139   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
1140   PetscReal  *gcoords;
1141   PetscInt    N;
1142   MPI_Comm    comm;
1143   PetscMPIInt size;
1144   PetscBool   cong;
1145 
1146   PetscFunctionBegin;
1147   PetscCall(PetscLayoutSetUp(A->rmap));
1148   PetscCall(PetscLayoutSetUp(A->cmap));
1149   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1150   PetscCall(MatHasCongruentLayouts(A, &cong));
1151   PetscCheck(cong, comm, PETSC_ERR_SUP, "Only for square matrices with congruent layouts");
1152   N = A->rmap->N;
1153   PetscCallMPI(MPI_Comm_size(comm, &size));
1154   if (spacedim > 0 && size > 1 && cdist) {
1155     PetscSF      sf;
1156     MPI_Datatype dtype;
1157 
1158     PetscCallMPI(MPI_Type_contiguous(spacedim, MPIU_REAL, &dtype));
1159     PetscCallMPI(MPI_Type_commit(&dtype));
1160 
1161     PetscCall(PetscSFCreate(comm, &sf));
1162     PetscCall(PetscSFSetGraphWithPattern(sf, A->rmap, PETSCSF_PATTERN_ALLGATHER));
1163     PetscCall(PetscMalloc1(spacedim * N, &gcoords));
1164     PetscCall(PetscSFBcastBegin(sf, dtype, coords, gcoords, MPI_REPLACE));
1165     PetscCall(PetscSFBcastEnd(sf, dtype, coords, gcoords, MPI_REPLACE));
1166     PetscCall(PetscSFDestroy(&sf));
1167     PetscCallMPI(MPI_Type_free(&dtype));
1168   } else gcoords = (PetscReal *)coords;
1169 
1170   delete h2opus->ptcloud;
1171   delete h2opus->kernel;
1172   h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim, N, gcoords);
1173   if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel, spacedim, kernelctx);
1174   if (gcoords != coords) PetscCall(PetscFree(gcoords));
1175   A->preallocated = PETSC_TRUE;
1176   PetscFunctionReturn(0);
1177 }
1178 
#if defined(PETSC_H2OPUS_USE_GPU)
/* Migrates the hierarchical data between host and device when the CPU binding
   changes: flg == PETSC_TRUE moves valid GPU data back to the CPU and frees the
   GPU copies; flg == PETSC_FALSE mirrors valid CPU data to the GPU and frees
   the CPU copies. Also switches the default vector type accordingly. */
static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg) {
  PetscMPIInt size;
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
    /* bind to CPU while valid data is on the GPU: copy device -> host */
    if (size > 1) {
      PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
#if defined(H2OPUS_USE_MPI)
      if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
      else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
#endif
    } else {
      PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
      else *a->hmatrix = *a->hmatrix_gpu;
    }
    /* release the device copies, CPU is now the only valid side */
    delete a->hmatrix_gpu;
    delete a->dist_hmatrix_gpu;
    a->hmatrix_gpu      = NULL;
    a->dist_hmatrix_gpu = NULL;
    A->offloadmask      = PETSC_OFFLOAD_CPU;
  } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
    /* unbind from CPU while valid data is on the host: copy host -> device */
    if (size > 1) {
      PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
#if defined(H2OPUS_USE_MPI)
      if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
      else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
#endif
    } else {
      PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
      else *a->hmatrix_gpu = *a->hmatrix;
    }
    /* release the host copies, GPU is now the only valid side */
    delete a->hmatrix;
    delete a->dist_hmatrix;
    a->hmatrix      = NULL;
    a->dist_hmatrix = NULL;
    A->offloadmask  = PETSC_OFFLOAD_GPU;
  }
  /* the default vector type follows the binding: CUDA vectors when on GPU */
  PetscCall(PetscFree(A->defaultvectype));
  if (!flg) {
    PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
  } else {
    PetscCall(PetscStrallocpy(VECSTANDARD, &A->defaultvectype));
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}
#endif
1231 
1232 /*MC
1233      MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package.
1234 
1235    Options Database Keys:
.     -mat_type h2opus - sets the matrix type to "h2opus" during a call to MatSetFromOptions()
1237 
1238    Notes:
1239      H2Opus implements hierarchical matrices in the H^2 flavour. It supports CPU or NVIDIA GPUs.
1240 
1241      For CPU only builds, use ./configure --download-h2opus --download-thrust to install PETSc to use H2Opus.
1242      In order to run on NVIDIA GPUs, use ./configure --download-h2opus --download-magma --download-kblas.
1243 
1244    Level: beginner
1245 
1246    Reference:
1247 .  * -  "H2Opus: A distributed-memory multi-GPU software package for non-local operators", https://arxiv.org/abs/2109.05451
1248 
1249 .seealso: `MATH2OPUS`, `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()`
1250 M*/
/* MATH2OPUS constructor: allocates the implementation context with default
   construction/sampling parameters, creates the H2Opus handle, and installs
   the matrix operations and composed product routines. */
PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A) {
  Mat_H2OPUS *a;
  PetscMPIInt size;

  PetscFunctionBegin;
#if defined(PETSC_H2OPUS_USE_GPU)
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
#endif
  PetscCall(PetscNewLog(A, &a));
  A->data = (void *)a;

  /* default admissibility, construction and sampling parameters */
  a->eta              = 0.9;
  a->leafsize         = 32;
  a->basisord         = 4;
  a->max_rank         = 64;
  a->bs               = 32;
  a->rtol             = 1.e-4;
  a->s                = 1.0;
  a->norm_max_samples = 10;
  a->resize           = PETSC_TRUE; /* reallocate after compression */
#if defined(H2OPUS_USE_MPI)
  h2opusCreateDistributedHandleComm(&a->handle, PetscObjectComm((PetscObject)A));
#else
  h2opusCreateHandle(&a->handle);
#endif
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATH2OPUS));
  PetscCall(PetscMemzero(A->ops, sizeof(struct _MatOps)));

  A->ops->destroy          = MatDestroy_H2OPUS;
  A->ops->view             = MatView_H2OPUS;
  A->ops->assemblyend      = MatAssemblyEnd_H2OPUS;
  A->ops->mult             = MatMult_H2OPUS;
  A->ops->multtranspose    = MatMultTranspose_H2OPUS;
  A->ops->multadd          = MatMultAdd_H2OPUS;
  A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
  A->ops->scale            = MatScale_H2OPUS;
  A->ops->duplicate        = MatDuplicate_H2OPUS;
  A->ops->setfromoptions   = MatSetFromOptions_H2OPUS;
  A->ops->norm             = MatNorm_H2OPUS;
  A->ops->zeroentries      = MatZeroEntries_H2OPUS;
#if defined(PETSC_H2OPUS_USE_GPU)
  A->ops->bindtocpu = MatBindToCPU_H2OPUS;
#endif

  /* register MatProduct support against (seq|mpi) dense (and dense CUDA) matrices */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", MatProductSetFromOptions_H2OPUS));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", MatProductSetFromOptions_H2OPUS));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", MatProductSetFromOptions_H2OPUS));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", MatProductSetFromOptions_H2OPUS));
#if defined(PETSC_H2OPUS_USE_GPU)
  /* GPU builds default to CUDA vectors */
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
#endif
  PetscFunctionReturn(0);
}
1306 
1307 /*@C
1308      MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.
1309 
1310    Input Parameter:
1311 .     A - the matrix
1312 
1313    Level: intermediate
1314 
1315 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`
1316 @*/
/* Orthogonalizes the basis tree of the hierarchical matrix (horthog /
   distributed_horthog), on whichever side (CPU/GPU) currently holds the data.
   A no-op for non-MATH2OPUS matrices or when already orthogonal.
   Note: the brace structure below interleaves with #if blocks — the 'else'
   branches only exist when PETSC_H2OPUS_USE_GPU is defined. */
PetscErrorCode MatH2OpusOrthogonalize(Mat A) {
  PetscBool   ish2opus;
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
  PetscMPIInt size;
  PetscBool   boundtocpu = PETSC_TRUE;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidType(A, 1);
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
  if (!ish2opus) PetscFunctionReturn(0);
  if (a->orthogonal) PetscFunctionReturn(0); /* already orthogonal, nothing to do */
  HLibProfile::clear();                      /* reset profiling counters to log flops below */
  PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog, A, 0, 0, 0));
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (size > 1) {
    if (boundtocpu) {
      PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
#if defined(H2OPUS_USE_MPI)
      distributed_horthog(*a->dist_hmatrix, a->handle);
#endif
#if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
#if defined(H2OPUS_USE_MPI)
      distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
#endif
      PetscCall(PetscLogGpuTimeEnd());
#endif
    }
  } else {
#if defined(H2OPUS_USE_MPI)
    h2opusHandle_t handle = a->handle->handle;
#else
    h2opusHandle_t handle = a->handle;
#endif
    if (boundtocpu) {
      PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      horthog(*a->hmatrix, handle);
#if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
      horthog(*a->hmatrix_gpu, handle);
      PetscCall(PetscLogGpuTimeEnd());
#endif
    }
  }
  a->orthogonal = PETSC_TRUE;
  { /* log flops */
    double gops, time, perf, dev;
    HLibProfile::getHorthogPerf(gops, time, perf, dev); /* gops is reported in 1e9 operations */
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9 * gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9 * gops));
    }
#else
    PetscCall(PetscLogFlops(1e9 * gops));
#endif
  }
  PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog, A, 0, 0, 0));
  PetscFunctionReturn(0);
}
1388 
1389 /*@C
1390      MatH2OpusCompress - Compress a hierarchical matrix.
1391 
1392    Input Parameters:
1393 +     A - the matrix
1394 -     tol - the absolute truncation threshold
1395 
1396    Level: intermediate
1397 
1398 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()`
1399 @*/
/* Compresses the hierarchical matrix to the absolute threshold tol
   (hcompress / distributed_hcompress) after first orthogonalizing it.
   When a->resize is set, the compressed matrix is copy-constructed anew so
   that memory shrinks to the compressed size. A no-op for non-MATH2OPUS
   matrices or tol <= 0. Note: as in MatH2OpusOrthogonalize, the 'else'
   branches below only exist when PETSC_H2OPUS_USE_GPU is defined. */
PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol) {
  PetscBool   ish2opus;
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
  PetscMPIInt size;
  PetscBool   boundtocpu = PETSC_TRUE;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidType(A, 1);
  PetscValidLogicalCollectiveReal(A, tol, 2);
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
  if (!ish2opus || tol <= 0.0) PetscFunctionReturn(0);
  PetscCall(MatH2OpusOrthogonalize(A)); /* compression requires an orthogonal basis */
  HLibProfile::clear();                 /* reset profiling counters to log flops below */
  PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress, A, 0, 0, 0));
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (size > 1) {
    if (boundtocpu) {
      PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
#if defined(H2OPUS_USE_MPI)
      distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
      if (a->resize) {
        /* reallocate via copy construction so memory shrinks to the compressed size */
        DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix);
        delete a->dist_hmatrix;
        a->dist_hmatrix = dist_hmatrix;
      }
#endif
#if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
#if defined(H2OPUS_USE_MPI)
      distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);

      if (a->resize) {
        DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu);
        delete a->dist_hmatrix_gpu;
        a->dist_hmatrix_gpu = dist_hmatrix_gpu;
      }
#endif
      PetscCall(PetscLogGpuTimeEnd());
#endif
    }
  } else {
#if defined(H2OPUS_USE_MPI)
    h2opusHandle_t handle = a->handle->handle;
#else
    h2opusHandle_t handle = a->handle;
#endif
    if (boundtocpu) {
      PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      hcompress(*a->hmatrix, tol, handle);

      if (a->resize) {
        HMatrix *hmatrix = new HMatrix(*a->hmatrix);
        delete a->hmatrix;
        a->hmatrix = hmatrix;
      }
#if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
      hcompress(*a->hmatrix_gpu, tol, handle);
      PetscCall(PetscLogGpuTimeEnd());

      if (a->resize) {
        HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu);
        delete a->hmatrix_gpu;
        a->hmatrix_gpu = hmatrix_gpu;
      }
#endif
    }
  }
  { /* log flops */
    double gops, time, perf, dev;
    HLibProfile::getHcompressPerf(gops, time, perf, dev); /* gops is reported in 1e9 operations */
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9 * gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9 * gops));
    }
#else
    PetscCall(PetscLogFlops(1e9 * gops));
#endif
  }
  PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress, A, 0, 0, 0));
  PetscFunctionReturn(0);
}
1494 
1495 /*@C
1496      MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix vector product to construct a hierarchical matrix.
1497 
1498    Input Parameters:
1499 +     A - the hierarchical matrix
1500 .     B - the matrix to be sampled
1501 .     bs - maximum number of samples to be taken concurrently
1502 -     tol - relative tolerance for construction
1503 
1504    Notes:
1505    You need to call `MatAssemblyBegin()` and `MatAssemblyEnd()` to update the hierarchical matrix.
1506 
1507    Level: intermediate
1508 
1509 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`
1510 @*/
1511 PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol) {
1512   PetscBool ish2opus;
1513 
1514   PetscFunctionBegin;
1515   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1516   PetscValidType(A, 1);
1517   if (B) PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
1518   PetscValidLogicalCollectiveInt(A, bs, 3);
1519   PetscValidLogicalCollectiveReal(A, tol, 3);
1520   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1521   if (ish2opus) {
1522     Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1523 
1524     if (!a->sampler) a->sampler = new PetscMatrixSampler();
1525     a->sampler->SetSamplingMat(B);
1526     if (bs > 0) a->bs = bs;
1527     if (tol > 0.) a->rtol = tol;
1528     delete a->kernel;
1529   }
1530   PetscFunctionReturn(0);
1531 }
1532 
1533 /*@C
1534      MatCreateH2OpusFromKernel - Creates a `MATH2OPUS` from a user-supplied kernel.
1535 
1536    Input Parameters:
1537 +     comm - MPI communicator
1538 .     m - number of local rows (or `PETSC_DECIDE` to have calculated if M is given)
1539 .     n - number of local columns (or `PETSC_DECIDE` to have calculated if N is given)
1540 .     M - number of global rows (or `PETSC_DETERMINE` to have calculated if m is given)
1541 .     N - number of global columns (or `PETSC_DETERMINE` to have calculated if n is given)
1542 .     spacedim - dimension of the space coordinates
1543 .     coords - coordinates of the points
1544 .     cdist - whether or not coordinates are distributed
1545 .     kernel - computational kernel (or NULL)
1546 .     kernelctx - kernel context
1547 .     eta - admissibility condition tolerance
1548 .     leafsize - leaf size in cluster tree
1549 -     basisord - approximation order for Chebychev interpolation of low-rank blocks
1550 
1551    Output Parameter:
1552 .     nA - matrix
1553 
1554    Options Database Keys:
1555 +     -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree
1556 .     -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance
1557 .     -mat_h2opus_order <`PetscInt`> - Chebychev approximation order
1558 -     -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms
1559 
1560    Level: intermediate
1561 
1562 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`
1563 @*/
1564 PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat *nA) {
1565   Mat         A;
1566   Mat_H2OPUS *h2opus;
1567 #if defined(PETSC_H2OPUS_USE_GPU)
1568   PetscBool iscpu = PETSC_FALSE;
1569 #else
1570   PetscBool iscpu = PETSC_TRUE;
1571 #endif
1572 
1573   PetscFunctionBegin;
1574   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1575   PetscCall(MatCreate(comm, &A));
1576   PetscCall(MatSetSizes(A, m, n, M, N));
1577   PetscCheck(M == N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1578   PetscCall(MatSetType(A, MATH2OPUS));
1579   PetscCall(MatBindToCPU(A, iscpu));
1580   PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, kernel, kernelctx));
1581 
1582   h2opus = (Mat_H2OPUS *)A->data;
1583   if (eta > 0.) h2opus->eta = eta;
1584   if (leafsize > 0) h2opus->leafsize = leafsize;
1585   if (basisord > 0) h2opus->basisord = basisord;
1586 
1587   *nA = A;
1588   PetscFunctionReturn(0);
1589 }
1590 
1591 /*@C
1592      MatCreateH2OpusFromMat - Creates a `MATH2OPUS` sampling from a user-supplied operator.
1593 
1594    Input Parameters:
1595 +     B - the matrix to be sampled
1596 .     spacedim - dimension of the space coordinates
1597 .     coords - coordinates of the points
1598 .     cdist - whether or not coordinates are distributed
1599 .     eta - admissibility condition tolerance
1600 .     leafsize - leaf size in cluster tree
1601 .     maxrank - maximum rank allowed
1602 .     bs - maximum number of samples to be taken concurrently
1603 -     rtol - relative tolerance for construction
1604 
1605    Output Parameter:
1606 .     nA - matrix
1607 
1608    Options Database Keys:
1609 +     -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree
1610 .     -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance
1611 .     -mat_h2opus_maxrank <`PetscInt`> - Maximum rank when constructed from matvecs
1612 .     -mat_h2opus_samples <`PetscInt`> - Maximum number of samples to be taken concurrently when constructing from matvecs
1613 .     -mat_h2opus_rtol <`PetscReal`> - Relative tolerance for construction from sampling
1614 .     -mat_h2opus_check <`PetscBool`> - Check error when constructing from sampling during MatAssemblyEnd()
1615 .     -mat_h2opus_hara_verbose <`PetscBool`> - Verbose output from hara construction
-     -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms
1617 
1618    Note:
1619    Not available in parallel
1620 
1621    Level: intermediate
1622 
1623 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()`
1624 @*/
1625 PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA) {
1626   Mat         A;
1627   Mat_H2OPUS *h2opus;
1628   MPI_Comm    comm;
1629   PetscBool   boundtocpu = PETSC_TRUE;
1630 
1631   PetscFunctionBegin;
1632   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
1633   PetscValidLogicalCollectiveInt(B, spacedim, 2);
1634   PetscValidLogicalCollectiveBool(B, cdist, 4);
1635   PetscValidLogicalCollectiveReal(B, eta, 5);
1636   PetscValidLogicalCollectiveInt(B, leafsize, 6);
1637   PetscValidLogicalCollectiveInt(B, maxrank, 7);
1638   PetscValidLogicalCollectiveInt(B, bs, 8);
1639   PetscValidLogicalCollectiveReal(B, rtol, 9);
1640   PetscValidPointer(nA, 10);
1641   PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
1642   PetscCheck(B->rmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1643   PetscCheck(B->rmap->N == B->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1644   PetscCall(MatCreate(comm, &A));
1645   PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
1646 #if defined(PETSC_H2OPUS_USE_GPU)
1647   {
1648     PetscBool iscuda;
1649     VecType   vtype;
1650 
1651     PetscCall(MatGetVecType(B, &vtype));
1652     PetscCall(PetscStrcmp(vtype, VECCUDA, &iscuda));
1653     if (!iscuda) {
1654       PetscCall(PetscStrcmp(vtype, VECSEQCUDA, &iscuda));
1655       if (!iscuda) PetscCall(PetscStrcmp(vtype, VECMPICUDA, &iscuda));
1656     }
1657     if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
1658   }
1659 #endif
1660   PetscCall(MatSetType(A, MATH2OPUS));
1661   PetscCall(MatBindToCPU(A, boundtocpu));
1662   if (spacedim) PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, NULL, NULL));
1663   PetscCall(MatPropagateSymmetryOptions(B, A));
1664   /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */
1665 
1666   h2opus          = (Mat_H2OPUS *)A->data;
1667   h2opus->sampler = new PetscMatrixSampler(B);
1668   if (eta > 0.) h2opus->eta = eta;
1669   if (leafsize > 0) h2opus->leafsize = leafsize;
1670   if (maxrank > 0) h2opus->max_rank = maxrank;
1671   if (bs > 0) h2opus->bs = bs;
1672   if (rtol > 0.) h2opus->rtol = rtol;
1673   *nA             = A;
1674   A->preallocated = PETSC_TRUE;
1675   PetscFunctionReturn(0);
1676 }
1677 
1678 /*@C
1679      MatH2OpusGetIndexMap - Access reordering index set.
1680 
1681    Input Parameters:
1682 .     A - the matrix
1683 
1684    Output Parameter:
1685 .     indexmap - the index set for the reordering
1686 
1687    Level: intermediate
1688 
1689 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1690 @*/
1691 PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap) {
1692   PetscBool   ish2opus;
1693   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1694 
1695   PetscFunctionBegin;
1696   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1697   PetscValidType(A, 1);
1698   PetscValidPointer(indexmap, 2);
1699   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1700   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1701   PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1702   *indexmap = a->h2opus_indexmap;
1703   PetscFunctionReturn(0);
1704 }
1705 
1706 /*@C
1707      MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering
1708 
1709    Input Parameters:
1710 +     A - the matrix
1711 .     nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1712 -     in - the vector to be mapped
1713 
1714    Output Parameter:
1715 .     out - the newly created mapped vector
1716 
1717    Level: intermediate
1718 
1719 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1720 @*/
1721 PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec *out) {
1722   PetscBool    ish2opus;
1723   Mat_H2OPUS  *a = (Mat_H2OPUS *)A->data;
1724   PetscScalar *xin, *xout;
1725   PetscBool    nm;
1726 
1727   PetscFunctionBegin;
1728   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1729   PetscValidType(A, 1);
1730   PetscValidLogicalCollectiveBool(A, nativetopetsc, 2);
1731   PetscValidHeaderSpecific(in, VEC_CLASSID, 3);
1732   PetscValidPointer(out, 4);
1733   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1734   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1735   PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1736   nm = a->nativemult;
1737   PetscCall(MatH2OpusSetNativeMult(A, (PetscBool)!nativetopetsc));
1738   PetscCall(MatCreateVecs(A, out, NULL));
1739   PetscCall(MatH2OpusSetNativeMult(A, nm));
1740   if (!a->sf) { /* same ordering */
1741     PetscCall(VecCopy(in, *out));
1742     PetscFunctionReturn(0);
1743   }
1744   PetscCall(VecGetArrayRead(in, (const PetscScalar **)&xin));
1745   PetscCall(VecGetArrayWrite(*out, &xout));
1746   if (nativetopetsc) {
1747     PetscCall(PetscSFReduceBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1748     PetscCall(PetscSFReduceEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1749   } else {
1750     PetscCall(PetscSFBcastBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1751     PetscCall(PetscSFBcastEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1752   }
1753   PetscCall(VecRestoreArrayRead(in, (const PetscScalar **)&xin));
1754   PetscCall(VecRestoreArrayWrite(*out, &xout));
1755   PetscFunctionReturn(0);
1756 }
1757 
1758 /*@C
1759      MatH2OpusLowRankUpdate - Perform a low-rank update of the form A = A + s * U * V^T
1760 
1761    Input Parameters:
1762 +     A - the hierarchical `MATH2OPUS` matrix
1763 .     s - the scaling factor
1764 .     U - the dense low-rank update matrix
1765 -     V - (optional) the dense low-rank update matrix (if NULL, then V = U is assumed)
1766 
1767    Note:
1768    The U and V matrices must be in dense format
1769 
1770    Level: intermediate
1771 
1772 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE`
1773 @*/
1774 PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s) {
1775   PetscBool flg;
1776 
1777   PetscFunctionBegin;
1778   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1779   PetscValidType(A, 1);
1780   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1781   PetscValidHeaderSpecific(U, MAT_CLASSID, 2);
1782   PetscCheckSameComm(A, 1, U, 2);
1783   if (V) {
1784     PetscValidHeaderSpecific(V, MAT_CLASSID, 3);
1785     PetscCheckSameComm(A, 1, V, 3);
1786   }
1787   PetscValidLogicalCollectiveScalar(A, s, 4);
1788 
1789   if (!V) V = U;
1790   PetscCheck(U->cmap->N == V->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT, U->cmap->N, V->cmap->N);
1791   if (!U->cmap->N) PetscFunctionReturn(0);
1792   PetscCall(PetscLayoutCompare(U->rmap, A->rmap, &flg));
1793   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A and U must have the same row layout");
1794   PetscCall(PetscLayoutCompare(V->rmap, A->cmap, &flg));
1795   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A column layout must match V row column layout");
1796   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &flg));
1797   if (flg) {
1798     Mat_H2OPUS        *a = (Mat_H2OPUS *)A->data;
1799     const PetscScalar *u, *v, *uu, *vv;
1800     PetscInt           ldu, ldv;
1801     PetscMPIInt        size;
1802 #if defined(H2OPUS_USE_MPI)
1803     h2opusHandle_t handle = a->handle->handle;
1804 #else
1805     h2opusHandle_t handle = a->handle;
1806 #endif
1807     PetscBool usesf = (PetscBool)(a->sf && !a->nativemult);
1808     PetscSF   usf, vsf;
1809 
1810     PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1811     PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet implemented in parallel");
1812     PetscCall(PetscLogEventBegin(MAT_H2Opus_LR, A, 0, 0, 0));
1813     PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U, &flg, MATSEQDENSE, MATMPIDENSE, ""));
1814     PetscCheck(flg, PetscObjectComm((PetscObject)U), PETSC_ERR_SUP, "Not for U of type %s", ((PetscObject)U)->type_name);
1815     PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V, &flg, MATSEQDENSE, MATMPIDENSE, ""));
1816     PetscCheck(flg, PetscObjectComm((PetscObject)V), PETSC_ERR_SUP, "Not for V of type %s", ((PetscObject)V)->type_name);
1817     PetscCall(MatDenseGetLDA(U, &ldu));
1818     PetscCall(MatDenseGetLDA(V, &ldv));
1819     PetscCall(MatBoundToCPU(A, &flg));
1820     if (usesf) {
1821       PetscInt n;
1822 
1823       PetscCall(MatDenseGetH2OpusVectorSF(U, a->sf, &usf));
1824       PetscCall(MatDenseGetH2OpusVectorSF(V, a->sf, &vsf));
1825       PetscCall(MatH2OpusResizeBuffers_Private(A, U->cmap->N, V->cmap->N));
1826       PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
1827       ldu = n;
1828       ldv = n;
1829     }
1830     if (flg) {
1831       PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1832       PetscCall(MatDenseGetArrayRead(U, &u));
1833       PetscCall(MatDenseGetArrayRead(V, &v));
1834       if (usesf) {
1835         vv = MatH2OpusGetThrustPointer(*a->yy);
1836         PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1837         PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1838         if (U != V) {
1839           uu = MatH2OpusGetThrustPointer(*a->xx);
1840           PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1841           PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1842         } else uu = vv;
1843       } else {
1844         uu = u;
1845         vv = v;
1846       }
1847       hlru_global(*a->hmatrix, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1848       PetscCall(MatDenseRestoreArrayRead(U, &u));
1849       PetscCall(MatDenseRestoreArrayRead(V, &v));
1850     } else {
1851 #if defined(PETSC_H2OPUS_USE_GPU)
1852       PetscBool flgU, flgV;
1853 
1854       PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1855       PetscCall(PetscObjectTypeCompareAny((PetscObject)U, &flgU, MATSEQDENSE, MATMPIDENSE, ""));
1856       if (flgU) PetscCall(MatConvert(U, MATDENSECUDA, MAT_INPLACE_MATRIX, &U));
1857       PetscCall(PetscObjectTypeCompareAny((PetscObject)V, &flgV, MATSEQDENSE, MATMPIDENSE, ""));
1858       if (flgV) PetscCall(MatConvert(V, MATDENSECUDA, MAT_INPLACE_MATRIX, &V));
1859       PetscCall(MatDenseCUDAGetArrayRead(U, &u));
1860       PetscCall(MatDenseCUDAGetArrayRead(V, &v));
1861       if (usesf) {
1862         vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
1863         PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1864         PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1865         if (U != V) {
1866           uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
1867           PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1868           PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1869         } else uu = vv;
1870       } else {
1871         uu = u;
1872         vv = v;
1873       }
1874 #else
1875       SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "This should not happen");
1876 #endif
1877       hlru_global(*a->hmatrix_gpu, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1878 #if defined(PETSC_H2OPUS_USE_GPU)
1879       PetscCall(MatDenseCUDARestoreArrayRead(U, &u));
1880       PetscCall(MatDenseCUDARestoreArrayRead(V, &v));
1881       if (flgU) PetscCall(MatConvert(U, MATDENSE, MAT_INPLACE_MATRIX, &U));
1882       if (flgV) PetscCall(MatConvert(V, MATDENSE, MAT_INPLACE_MATRIX, &V));
1883 #endif
1884     }
1885     PetscCall(PetscLogEventEnd(MAT_H2Opus_LR, A, 0, 0, 0));
1886     a->orthogonal = PETSC_FALSE;
1887   }
1888   PetscFunctionReturn(0);
1889 }
1890 #endif
1891