xref: /petsc/src/mat/impls/h2opus/cuda/math2opus.cu (revision 66af8762ec03dbef0e079729eb2a1734a35ed7ff)
1 #include <h2opusconf.h>
2 /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
3 #if !defined(__CUDACC__) || defined(H2OPUS_USE_GPU)
4   #include <h2opus.h>
5   #if defined(H2OPUS_USE_MPI)
6     #include <h2opus/distributed/distributed_h2opus_handle.h>
7     #include <h2opus/distributed/distributed_geometric_construction.h>
8     #include <h2opus/distributed/distributed_hgemv.h>
9     #include <h2opus/distributed/distributed_horthog.h>
10     #include <h2opus/distributed/distributed_hcompress.h>
11   #endif
12   #include <h2opus/util/boxentrygen.h>
13   #include <petsc/private/matimpl.h>
14   #include <petsc/private/vecimpl.h>
15   #include <petsc/private/deviceimpl.h>
16   #include <petscsf.h>
17 
18 /* math2opusutils */
19 PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF, PetscInt, PetscInt, PetscInt, PetscSF *);
20 PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat, PetscSF, PetscSF *);
21 PETSC_INTERN PetscErrorCode VecSign(Vec, Vec);
22 PETSC_INTERN PetscErrorCode VecSetDelta(Vec, PetscInt);
23 PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat, NormType, PetscInt, PetscReal *);
24 
25   #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())
26 
27   /* Use GPU only if H2OPUS is configured for GPU */
28   #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
29     #define PETSC_H2OPUS_USE_GPU
30   #endif
31   #if defined(PETSC_H2OPUS_USE_GPU)
32     #define MatH2OpusUpdateIfNeeded(A, B) MatBindToCPU(A, (PetscBool)((A)->boundtocpu || (B)))
33   #else
34     #define MatH2OpusUpdateIfNeeded(A, B) PETSC_SUCCESS
35   #endif
36 
37 // TODO H2OPUS:
38 // DistributedHMatrix
39 //   unsymmetric ?
40 //   transpose for distributed_hgemv?
41 //   clearData()
42 // Unify interface for sequential and parallel?
43 // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
44 //
45 template <class T>
46 class PetscPointCloud : public H2OpusDataSet<T> {
47 private:
48   int            dimension;
49   size_t         num_points;
50   std::vector<T> pts;
51 
52 public:
53   PetscPointCloud(int dim, size_t num_pts, const T coords[])
54   {
55     dim              = dim > 0 ? dim : 1;
56     this->dimension  = dim;
57     this->num_points = num_pts;
58 
59     pts.resize(num_pts * dim);
60     if (coords) {
61       for (size_t n = 0; n < num_pts; n++)
62         for (int i = 0; i < dim; i++) pts[n * dim + i] = coords[n * dim + i];
63     } else {
64       PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0;
65       for (size_t n = 0; n < num_pts; n++) {
66         pts[n * dim] = n * h;
67         for (int i = 1; i < dim; i++) pts[n * dim + i] = 0.0;
68       }
69     }
70   }
71 
72   PetscPointCloud(const PetscPointCloud<T> &other)
73   {
74     size_t N         = other.dimension * other.num_points;
75     this->dimension  = other.dimension;
76     this->num_points = other.num_points;
77     this->pts.resize(N);
78     for (size_t i = 0; i < N; i++) this->pts[i] = other.pts[i];
79   }
80 
81   int getDimension() const { return dimension; }
82 
83   size_t getDataSetSize() const { return num_points; }
84 
85   T getDataPoint(size_t idx, int dim) const
86   {
87     assert(dim < dimension && idx < num_points);
88     return pts[idx * dimension + dim];
89   }
90 
91   void Print(std::ostream &out = std::cout)
92   {
93     out << "Dimension: " << dimension << std::endl;
94     out << "NumPoints: " << num_points << std::endl;
95     for (size_t n = 0; n < num_points; n++) {
96       for (int d = 0; d < dimension; d++) out << pts[n * dimension + d] << " ";
97       out << std::endl;
98     }
99   }
100 };
101 
102 template <class T>
103 class PetscFunctionGenerator {
104 private:
105   MatH2OpusKernel k;
106   int             dim;
107   void           *ctx;
108 
109 public:
110   PetscFunctionGenerator(MatH2OpusKernel k, int dim, void *ctx)
111   {
112     this->k   = k;
113     this->dim = dim;
114     this->ctx = ctx;
115   }
116   PetscFunctionGenerator(PetscFunctionGenerator &other)
117   {
118     this->k   = other.k;
119     this->dim = other.dim;
120     this->ctx = other.ctx;
121   }
122   T operator()(PetscReal *pt1, PetscReal *pt2) { return (T)((*this->k)(this->dim, pt1, pt2, this->ctx)); }
123 };
124 
125   #include <../src/mat/impls/h2opus/math2opussampler.hpp>
126 
127   /* just to not clutter the code */
128   #if !defined(H2OPUS_USE_GPU)
129 typedef HMatrix HMatrix_GPU;
130     #if defined(H2OPUS_USE_MPI)
131 typedef DistributedHMatrix DistributedHMatrix_GPU;
132     #endif
133   #endif
134 
135 typedef struct {
136   #if defined(H2OPUS_USE_MPI)
137   distributedH2OpusHandle_t handle;
138   #else
139   h2opusHandle_t handle;
140   #endif
141   /* Sequential and parallel matrices are two different classes at the moment */
142   HMatrix *hmatrix;
143   #if defined(H2OPUS_USE_MPI)
144   DistributedHMatrix *dist_hmatrix;
145   #else
146   HMatrix       *dist_hmatrix;     /* just to not clutter the code */
147   #endif
148   /* May use permutations */
149   PetscSF                           sf;
150   PetscLayout                       h2opus_rmap, h2opus_cmap;
151   IS                                h2opus_indexmap;
152   thrust::host_vector<PetscScalar> *xx, *yy;
153   PetscInt                          xxs, yys;
154   PetscBool                         multsetup;
155 
156   /* GPU */
157   HMatrix_GPU *hmatrix_gpu;
158   #if defined(H2OPUS_USE_MPI)
159   DistributedHMatrix_GPU *dist_hmatrix_gpu;
160   #else
161   HMatrix_GPU   *dist_hmatrix_gpu; /* just to not clutter the code */
162   #endif
163   #if defined(PETSC_H2OPUS_USE_GPU)
164   thrust::device_vector<PetscScalar> *xx_gpu, *yy_gpu;
165   PetscInt                            xxs_gpu, yys_gpu;
166   #endif
167 
168   /* construction from matvecs */
169   PetscMatrixSampler *sampler;
170   PetscBool           nativemult;
171 
172   /* Admissibility */
173   PetscReal eta;
174   PetscInt  leafsize;
175 
176   /* for dof reordering */
177   PetscPointCloud<PetscReal> *ptcloud;
178 
179   /* kernel for generating matrix entries */
180   PetscFunctionGenerator<PetscScalar> *kernel;
181 
182   /* basis orthogonalized? */
183   PetscBool orthogonal;
184 
185   /* customization */
186   PetscInt  basisord;
187   PetscInt  max_rank;
188   PetscInt  bs;
189   PetscReal rtol;
190   PetscInt  norm_max_samples;
191   PetscBool check_construction;
192   PetscBool hara_verbose;
193   PetscBool resize;
194 
195   /* keeps track of MatScale values */
196   PetscScalar s;
197 } Mat_H2OPUS;
198 
199 static PetscErrorCode MatDestroy_H2OPUS(Mat A)
200 {
201   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
202 
203   PetscFunctionBegin;
204   #if defined(H2OPUS_USE_MPI)
205   h2opusDestroyDistributedHandle(a->handle);
206   #else
207   h2opusDestroyHandle(a->handle);
208   #endif
209   delete a->dist_hmatrix;
210   delete a->hmatrix;
211   PetscCall(PetscSFDestroy(&a->sf));
212   PetscCall(PetscLayoutDestroy(&a->h2opus_rmap));
213   PetscCall(PetscLayoutDestroy(&a->h2opus_cmap));
214   PetscCall(ISDestroy(&a->h2opus_indexmap));
215   delete a->xx;
216   delete a->yy;
217   delete a->hmatrix_gpu;
218   delete a->dist_hmatrix_gpu;
219   #if defined(PETSC_H2OPUS_USE_GPU)
220   delete a->xx_gpu;
221   delete a->yy_gpu;
222   #endif
223   delete a->sampler;
224   delete a->ptcloud;
225   delete a->kernel;
226   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", NULL));
227   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", NULL));
228   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", NULL));
229   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", NULL));
230   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
231   PetscCall(PetscFree(A->data));
232   PetscFunctionReturn(PETSC_SUCCESS);
233 }
234 
235 PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
236 {
237   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
238   PetscBool   ish2opus;
239 
240   PetscFunctionBegin;
241   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
242   PetscValidLogicalCollectiveBool(A, nm, 2);
243   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
244   if (ish2opus) {
245     if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
246       if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
247         PetscLayout t;
248         t              = A->rmap;
249         A->rmap        = a->h2opus_rmap;
250         a->h2opus_rmap = t;
251         t              = A->cmap;
252         A->cmap        = a->h2opus_cmap;
253         a->h2opus_cmap = t;
254       }
255     }
256     a->nativemult = nm;
257   }
258   PetscFunctionReturn(PETSC_SUCCESS);
259 }
260 
261 PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
262 {
263   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
264   PetscBool   ish2opus;
265 
266   PetscFunctionBegin;
267   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
268   PetscAssertPointer(nm, 2);
269   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
270   PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
271   *nm = a->nativemult;
272   PetscFunctionReturn(PETSC_SUCCESS);
273 }
274 
275 PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal *n)
276 {
277   PetscBool   ish2opus;
278   PetscInt    nmax = PETSC_DECIDE;
279   Mat_H2OPUS *a    = NULL;
280   PetscBool   mult = PETSC_FALSE;
281 
282   PetscFunctionBegin;
283   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
284   if (ish2opus) { /* set userdefine number of samples and fastpath for mult (norms are order independent) */
285     a = (Mat_H2OPUS *)A->data;
286 
287     nmax = a->norm_max_samples;
288     mult = a->nativemult;
289     PetscCall(MatH2OpusSetNativeMult(A, PETSC_TRUE));
290   } else {
291     PetscCall(PetscOptionsGetInt(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_approximate_norm_samples", &nmax, NULL));
292   }
293   PetscCall(MatApproximateNorm_Private(A, normtype, nmax, n));
294   if (a) PetscCall(MatH2OpusSetNativeMult(A, mult));
295   PetscFunctionReturn(PETSC_SUCCESS);
296 }
297 
298 static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
299 {
300   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
301   PetscInt    n;
302   PetscBool   boundtocpu = PETSC_TRUE;
303 
304   PetscFunctionBegin;
305   #if defined(PETSC_H2OPUS_USE_GPU)
306   boundtocpu = A->boundtocpu;
307   #endif
308   PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
309   if (boundtocpu) {
310     if (h2opus->xxs < xN) {
311       h2opus->xx->resize(n * xN);
312       h2opus->xxs = xN;
313     }
314     if (h2opus->yys < yN) {
315       h2opus->yy->resize(n * yN);
316       h2opus->yys = yN;
317     }
318   }
319   #if defined(PETSC_H2OPUS_USE_GPU)
320   if (!boundtocpu) {
321     if (h2opus->xxs_gpu < xN) {
322       h2opus->xx_gpu->resize(n * xN);
323       h2opus->xxs_gpu = xN;
324     }
325     if (h2opus->yys_gpu < yN) {
326       h2opus->yy_gpu->resize(n * yN);
327       h2opus->yys_gpu = yN;
328     }
329   }
330   #endif
331   PetscFunctionReturn(PETSC_SUCCESS);
332 }
333 
334 static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
335 {
336   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
337   #if defined(H2OPUS_USE_MPI)
338   h2opusHandle_t handle = h2opus->handle->handle;
339   #else
340   h2opusHandle_t handle = h2opus->handle;
341   #endif
342   PetscBool    boundtocpu = PETSC_TRUE;
343   PetscScalar *xx, *yy, *uxx, *uyy;
344   PetscInt     blda, clda;
345   PetscMPIInt  size;
346   PetscSF      bsf, csf;
347   PetscBool    usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
348 
349   PetscFunctionBegin;
350   HLibProfile::clear();
351   #if defined(PETSC_H2OPUS_USE_GPU)
352   boundtocpu = A->boundtocpu;
353   #endif
354   PetscCall(MatDenseGetLDA(B, &blda));
355   PetscCall(MatDenseGetLDA(C, &clda));
356   if (usesf) {
357     PetscInt n;
358 
359     PetscCall(MatDenseGetH2OpusVectorSF(B, h2opus->sf, &bsf));
360     PetscCall(MatDenseGetH2OpusVectorSF(C, h2opus->sf, &csf));
361 
362     PetscCall(MatH2OpusResizeBuffers_Private(A, B->cmap->N, C->cmap->N));
363     PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
364     blda = n;
365     clda = n;
366   }
367   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
368   if (boundtocpu) {
369     PetscCall(MatDenseGetArrayRead(B, (const PetscScalar **)&xx));
370     PetscCall(MatDenseGetArrayWrite(C, &yy));
371     if (usesf) {
372       uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
373       uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
374       PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
375       PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
376     } else {
377       uxx = xx;
378       uyy = yy;
379     }
380     if (size > 1) {
381       PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
382       PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
383   #if defined(H2OPUS_USE_MPI)
384       distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
385   #endif
386     } else {
387       PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
388       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
389     }
390     PetscCall(MatDenseRestoreArrayRead(B, (const PetscScalar **)&xx));
391     if (usesf) {
392       PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
393       PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
394     }
395     PetscCall(MatDenseRestoreArrayWrite(C, &yy));
396   #if defined(PETSC_H2OPUS_USE_GPU)
397   } else {
398     PetscBool ciscuda, biscuda;
399 
400     /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
401     PetscCall(PetscObjectTypeCompareAny((PetscObject)B, &biscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
402     if (!biscuda) PetscCall(MatConvert(B, MATDENSECUDA, MAT_INPLACE_MATRIX, &B));
403     PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &ciscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
404     if (!ciscuda) {
405       C->assembled = PETSC_TRUE;
406       PetscCall(MatConvert(C, MATDENSECUDA, MAT_INPLACE_MATRIX, &C));
407     }
408     PetscCall(MatDenseCUDAGetArrayRead(B, (const PetscScalar **)&xx));
409     PetscCall(MatDenseCUDAGetArrayWrite(C, &yy));
410     if (usesf) {
411       uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
412       uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
413       PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
414       PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
415     } else {
416       uxx = xx;
417       uyy = yy;
418     }
419     PetscCall(PetscLogGpuTimeBegin());
420     if (size > 1) {
421       PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
422       PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
423     #if defined(H2OPUS_USE_MPI)
424       distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
425     #endif
426     } else {
427       PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
428       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
429     }
430     PetscCall(PetscLogGpuTimeEnd());
431     PetscCall(MatDenseCUDARestoreArrayRead(B, (const PetscScalar **)&xx));
432     if (usesf) {
433       PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
434       PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
435     }
436     PetscCall(MatDenseCUDARestoreArrayWrite(C, &yy));
437     if (!biscuda) PetscCall(MatConvert(B, MATDENSE, MAT_INPLACE_MATRIX, &B));
438     if (!ciscuda) PetscCall(MatConvert(C, MATDENSE, MAT_INPLACE_MATRIX, &C));
439   #endif
440   }
441   { /* log flops */
442     double gops, time, perf, dev;
443     HLibProfile::getHgemvPerf(gops, time, perf, dev);
444   #if defined(PETSC_H2OPUS_USE_GPU)
445     if (boundtocpu) {
446       PetscCall(PetscLogFlops(1e9 * gops));
447     } else {
448       PetscCall(PetscLogGpuFlops(1e9 * gops));
449     }
450   #else
451     PetscCall(PetscLogFlops(1e9 * gops));
452   #endif
453   }
454   PetscFunctionReturn(PETSC_SUCCESS);
455 }
456 
457 static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
458 {
459   Mat_Product *product = C->product;
460 
461   PetscFunctionBegin;
462   MatCheckProduct(C, 1);
463   switch (product->type) {
464   case MATPRODUCT_AB:
465     PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_FALSE, product->B, C));
466     break;
467   case MATPRODUCT_AtB:
468     PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_TRUE, product->B, C));
469     break;
470   default:
471     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
472   }
473   PetscFunctionReturn(PETSC_SUCCESS);
474 }
475 
476 static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
477 {
478   Mat_Product *product = C->product;
479   PetscBool    cisdense;
480   Mat          A, B;
481 
482   PetscFunctionBegin;
483   MatCheckProduct(C, 1);
484   A = product->A;
485   B = product->B;
486   switch (product->type) {
487   case MATPRODUCT_AB:
488     PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
489     PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
490     PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
491     if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
492     PetscCall(MatSetUp(C));
493     break;
494   case MATPRODUCT_AtB:
495     PetscCall(MatSetSizes(C, A->cmap->n, B->cmap->n, A->cmap->N, B->cmap->N));
496     PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
497     PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
498     if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
499     PetscCall(MatSetUp(C));
500     break;
501   default:
502     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
503   }
504   C->ops->productsymbolic = NULL;
505   C->ops->productnumeric  = MatProductNumeric_H2OPUS;
506   PetscFunctionReturn(PETSC_SUCCESS);
507 }
508 
509 static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
510 {
511   PetscFunctionBegin;
512   MatCheckProduct(C, 1);
513   if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
514   PetscFunctionReturn(PETSC_SUCCESS);
515 }
516 
517 static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
518 {
519   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
520   #if defined(H2OPUS_USE_MPI)
521   h2opusHandle_t handle = h2opus->handle->handle;
522   #else
523   h2opusHandle_t handle = h2opus->handle;
524   #endif
525   PetscBool    boundtocpu = PETSC_TRUE;
526   PetscInt     n;
527   PetscScalar *xx, *yy, *uxx, *uyy;
528   PetscMPIInt  size;
529   PetscBool    usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
530 
531   PetscFunctionBegin;
532   HLibProfile::clear();
533   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
534   #if defined(PETSC_H2OPUS_USE_GPU)
535   boundtocpu = A->boundtocpu;
536   #endif
537   if (usesf) {
538     PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
539   } else n = A->rmap->n;
540   if (boundtocpu) {
541     PetscCall(VecGetArrayRead(x, (const PetscScalar **)&xx));
542     if (sy == 0.0) {
543       PetscCall(VecGetArrayWrite(y, &yy));
544     } else {
545       PetscCall(VecGetArray(y, &yy));
546     }
547     if (usesf) {
548       uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
549       uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
550 
551       PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
552       PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
553       if (sy != 0.0) {
554         PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
555         PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
556       }
557     } else {
558       uxx = xx;
559       uyy = yy;
560     }
561     if (size > 1) {
562       PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
563       PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
564   #if defined(H2OPUS_USE_MPI)
565       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
566   #endif
567     } else {
568       PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
569       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
570     }
571     PetscCall(VecRestoreArrayRead(x, (const PetscScalar **)&xx));
572     if (usesf) {
573       PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
574       PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
575     }
576     if (sy == 0.0) {
577       PetscCall(VecRestoreArrayWrite(y, &yy));
578     } else {
579       PetscCall(VecRestoreArray(y, &yy));
580     }
581   #if defined(PETSC_H2OPUS_USE_GPU)
582   } else {
583     PetscCall(VecCUDAGetArrayRead(x, (const PetscScalar **)&xx));
584     if (sy == 0.0) {
585       PetscCall(VecCUDAGetArrayWrite(y, &yy));
586     } else {
587       PetscCall(VecCUDAGetArray(y, &yy));
588     }
589     if (usesf) {
590       uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
591       uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
592 
593       PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
594       PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
595       if (sy != 0.0) {
596         PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
597         PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
598       }
599     } else {
600       uxx = xx;
601       uyy = yy;
602     }
603     PetscCall(PetscLogGpuTimeBegin());
604     if (size > 1) {
605       PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
606       PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
607     #if defined(H2OPUS_USE_MPI)
608       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
609     #endif
610     } else {
611       PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
612       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
613     }
614     PetscCall(PetscLogGpuTimeEnd());
615     PetscCall(VecCUDARestoreArrayRead(x, (const PetscScalar **)&xx));
616     if (usesf) {
617       PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
618       PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
619     }
620     if (sy == 0.0) {
621       PetscCall(VecCUDARestoreArrayWrite(y, &yy));
622     } else {
623       PetscCall(VecCUDARestoreArray(y, &yy));
624     }
625   #endif
626   }
627   { /* log flops */
628     double gops, time, perf, dev;
629     HLibProfile::getHgemvPerf(gops, time, perf, dev);
630   #if defined(PETSC_H2OPUS_USE_GPU)
631     if (boundtocpu) {
632       PetscCall(PetscLogFlops(1e9 * gops));
633     } else {
634       PetscCall(PetscLogGpuFlops(1e9 * gops));
635     }
636   #else
637     PetscCall(PetscLogFlops(1e9 * gops));
638   #endif
639   }
640   PetscFunctionReturn(PETSC_SUCCESS);
641 }
642 
643 static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
644 {
645   PetscBool xiscuda, yiscuda;
646 
647   PetscFunctionBegin;
648   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
649   PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
650   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
651   PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_TRUE));
652   PetscFunctionReturn(PETSC_SUCCESS);
653 }
654 
655 static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
656 {
657   PetscBool xiscuda, yiscuda;
658 
659   PetscFunctionBegin;
660   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
661   PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
662   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
663   PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_FALSE));
664   PetscFunctionReturn(PETSC_SUCCESS);
665 }
666 
667 static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
668 {
669   PetscBool xiscuda, ziscuda;
670 
671   PetscFunctionBegin;
672   PetscCall(VecCopy(y, z));
673   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
674   PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
675   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
676   PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_TRUE));
677   PetscFunctionReturn(PETSC_SUCCESS);
678 }
679 
680 static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
681 {
682   PetscBool xiscuda, ziscuda;
683 
684   PetscFunctionBegin;
685   PetscCall(VecCopy(y, z));
686   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
687   PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
688   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
689   PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_FALSE));
690   PetscFunctionReturn(PETSC_SUCCESS);
691 }
692 
693 static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
694 {
695   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
696 
697   PetscFunctionBegin;
698   a->s *= s;
699   PetscFunctionReturn(PETSC_SUCCESS);
700 }
701 
702 static PetscErrorCode MatSetFromOptions_H2OPUS(Mat A, PetscOptionItems *PetscOptionsObject)
703 {
704   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
705 
706   PetscFunctionBegin;
707   PetscOptionsHeadBegin(PetscOptionsObject, "H2OPUS options");
708   PetscCall(PetscOptionsInt("-mat_h2opus_leafsize", "Leaf size of cluster tree", NULL, a->leafsize, &a->leafsize, NULL));
709   PetscCall(PetscOptionsReal("-mat_h2opus_eta", "Admissibility condition tolerance", NULL, a->eta, &a->eta, NULL));
710   PetscCall(PetscOptionsInt("-mat_h2opus_order", "Basis order for off-diagonal sampling when constructed from kernel", NULL, a->basisord, &a->basisord, NULL));
711   PetscCall(PetscOptionsInt("-mat_h2opus_maxrank", "Maximum rank when constructed from matvecs", NULL, a->max_rank, &a->max_rank, NULL));
712   PetscCall(PetscOptionsInt("-mat_h2opus_samples", "Maximum number of samples to be taken concurrently when constructing from matvecs", NULL, a->bs, &a->bs, NULL));
713   PetscCall(PetscOptionsInt("-mat_h2opus_normsamples", "Maximum number of samples to be when estimating norms", NULL, a->norm_max_samples, &a->norm_max_samples, NULL));
714   PetscCall(PetscOptionsReal("-mat_h2opus_rtol", "Relative tolerance for construction from sampling", NULL, a->rtol, &a->rtol, NULL));
715   PetscCall(PetscOptionsBool("-mat_h2opus_check", "Check error when constructing from sampling during MatAssemblyEnd()", NULL, a->check_construction, &a->check_construction, NULL));
716   PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose", "Verbose output from hara construction", NULL, a->hara_verbose, &a->hara_verbose, NULL));
717   PetscCall(PetscOptionsBool("-mat_h2opus_resize", "Resize after compression", NULL, a->resize, &a->resize, NULL));
718   PetscOptionsHeadEnd();
719   PetscFunctionReturn(PETSC_SUCCESS);
720 }
721 
722 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat, PetscInt, const PetscReal[], PetscBool, MatH2OpusKernel, void *);
723 
724 static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
725 {
726   Mat_H2OPUS        *a = (Mat_H2OPUS *)A->data;
727   Vec                c;
728   PetscInt           spacedim;
729   const PetscScalar *coords;
730 
731   PetscFunctionBegin;
732   if (a->ptcloud) PetscFunctionReturn(PETSC_SUCCESS);
733   PetscCall(PetscObjectQuery((PetscObject)A, "__math2opus_coords", (PetscObject *)&c));
734   if (!c && a->sampler) {
735     Mat S = a->sampler->GetSamplingMat();
736 
737     PetscCall(PetscObjectQuery((PetscObject)S, "__math2opus_coords", (PetscObject *)&c));
738   }
739   if (!c) {
740     PetscCall(MatH2OpusSetCoords_H2OPUS(A, -1, NULL, PETSC_FALSE, NULL, NULL));
741   } else {
742     PetscCall(VecGetArrayRead(c, &coords));
743     PetscCall(VecGetBlockSize(c, &spacedim));
744     PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, PETSC_FALSE, NULL, NULL));
745     PetscCall(VecRestoreArrayRead(c, &coords));
746   }
747   PetscFunctionReturn(PETSC_SUCCESS);
748 }
749 
750 static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
751 {
752   MPI_Comm      comm;
753   PetscMPIInt   size;
754   Mat_H2OPUS   *a = (Mat_H2OPUS *)A->data;
755   PetscInt      n = 0, *idx = NULL;
756   int          *iidx = NULL;
757   PetscCopyMode own;
758   PetscBool     rid;
759 
760   PetscFunctionBegin;
761   if (a->multsetup) PetscFunctionReturn(PETSC_SUCCESS);
762   if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */
763     PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
764   #if defined(PETSC_H2OPUS_USE_GPU)
765     a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
766     a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
767     a->xxs_gpu = 1;
768     a->yys_gpu = 1;
769   #endif
770     a->xx  = new thrust::host_vector<PetscScalar>(n);
771     a->yy  = new thrust::host_vector<PetscScalar>(n);
772     a->xxs = 1;
773     a->yys = 1;
774   } else {
775     IS is;
776     PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
777     PetscCallMPI(MPI_Comm_size(comm, &size));
778     if (!a->h2opus_indexmap) {
779       if (size > 1) {
780         PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
781   #if defined(H2OPUS_USE_MPI)
782         iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
783         n    = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
784   #endif
785       } else {
786         iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
787         n    = a->hmatrix->u_basis_tree.index_map.size();
788       }
789 
790       if (PetscDefined(USE_64BIT_INDICES)) {
791         PetscInt i;
792 
793         own = PETSC_OWN_POINTER;
794         PetscCall(PetscMalloc1(n, &idx));
795         for (i = 0; i < n; i++) idx[i] = iidx[i];
796       } else {
797         own = PETSC_COPY_VALUES;
798         idx = (PetscInt *)iidx;
799       }
800       PetscCall(ISCreateGeneral(comm, n, idx, own, &is));
801       PetscCall(ISSetPermutation(is));
802       PetscCall(ISViewFromOptions(is, (PetscObject)A, "-mat_h2opus_indexmap_view"));
803       a->h2opus_indexmap = is;
804     }
805     PetscCall(ISGetLocalSize(a->h2opus_indexmap, &n));
806     PetscCall(ISGetIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
807     rid = (PetscBool)(n == A->rmap->n);
808     PetscCall(MPIU_Allreduce(MPI_IN_PLACE, &rid, 1, MPIU_BOOL, MPI_LAND, comm));
809     if (rid) PetscCall(ISIdentity(a->h2opus_indexmap, &rid));
810     if (!rid) {
811       if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
812         PetscCall(PetscLayoutCreate(comm, &a->h2opus_rmap));
813         PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap, n));
814         PetscCall(PetscLayoutSetUp(a->h2opus_rmap));
815         PetscCall(PetscLayoutReference(a->h2opus_rmap, &a->h2opus_cmap));
816       }
817       PetscCall(PetscSFCreate(comm, &a->sf));
818       PetscCall(PetscSFSetGraphLayout(a->sf, A->rmap, n, NULL, PETSC_OWN_POINTER, idx));
819       PetscCall(PetscSFSetUp(a->sf));
820       PetscCall(PetscSFViewFromOptions(a->sf, (PetscObject)A, "-mat_h2opus_sf_view"));
821   #if defined(PETSC_H2OPUS_USE_GPU)
822       a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
823       a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
824       a->xxs_gpu = 1;
825       a->yys_gpu = 1;
826   #endif
827       a->xx  = new thrust::host_vector<PetscScalar>(n);
828       a->yy  = new thrust::host_vector<PetscScalar>(n);
829       a->xxs = 1;
830       a->yys = 1;
831     }
832     PetscCall(ISRestoreIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
833   }
834   a->multsetup = PETSC_TRUE;
835   PetscFunctionReturn(PETSC_SUCCESS);
836 }
837 
838 static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
839 {
840   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
841   #if defined(H2OPUS_USE_MPI)
842   h2opusHandle_t handle = a->handle->handle;
843   #else
844   h2opusHandle_t handle = a->handle;
845   #endif
846   PetscBool   kernel       = PETSC_FALSE;
847   PetscBool   boundtocpu   = PETSC_TRUE;
848   PetscBool   samplingdone = PETSC_FALSE;
849   MPI_Comm    comm;
850   PetscMPIInt size;
851 
852   PetscFunctionBegin;
853   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
854   PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
855   PetscCheck(A->rmap->N == A->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
856 
857   /* XXX */
858   a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));
859 
860   PetscCallMPI(MPI_Comm_size(comm, &size));
861   /* TODO REUSABILITY of geometric construction */
862   delete a->hmatrix;
863   delete a->dist_hmatrix;
864   #if defined(PETSC_H2OPUS_USE_GPU)
865   delete a->hmatrix_gpu;
866   delete a->dist_hmatrix_gpu;
867   #endif
868   a->orthogonal = PETSC_FALSE;
869 
870   /* TODO: other? */
871   H2OpusBoxCenterAdmissibility adm(a->eta);
872 
873   PetscCall(PetscLogEventBegin(MAT_H2Opus_Build, A, 0, 0, 0));
874   if (size > 1) {
875   #if defined(H2OPUS_USE_MPI)
876     a->dist_hmatrix = new DistributedHMatrix(A->rmap->n /* ,A->symmetric */);
877   #else
878     a->dist_hmatrix = NULL;
879   #endif
880   } else a->hmatrix = new HMatrix(A->rmap->n, A->symmetric == PETSC_BOOL3_TRUE);
881   PetscCall(MatH2OpusInferCoordinates_Private(A));
882   PetscCheck(a->ptcloud, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing pointcloud");
883   if (a->kernel) {
884     BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
885     if (size > 1) {
886       PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
887   #if defined(H2OPUS_USE_MPI)
888       buildDistributedHMatrix(*a->dist_hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord, a->handle);
889   #endif
890     } else {
891       buildHMatrix(*a->hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord);
892     }
893     kernel = PETSC_TRUE;
894   } else {
895     PetscCheck(size <= 1, comm, PETSC_ERR_SUP, "Construction from sampling not supported in parallel");
896     buildHMatrixStructure(*a->hmatrix, a->ptcloud, a->leafsize, adm);
897   }
898   PetscCall(MatSetUpMultiply_H2OPUS(A));
899 
900   #if defined(PETSC_H2OPUS_USE_GPU)
901   boundtocpu = A->boundtocpu;
902   if (!boundtocpu) {
903     if (size > 1) {
904       PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
905     #if defined(H2OPUS_USE_MPI)
906       a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
907     #endif
908     } else {
909       a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
910     }
911   }
912   #endif
913   if (size == 1) {
914     if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
915       PetscReal Anorm;
916       bool      verbose;
917 
918       PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_hara_verbose", &a->hara_verbose, NULL));
919       verbose = a->hara_verbose;
920       PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(), NORM_2, a->norm_max_samples, &Anorm));
921       if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF, "Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n", a->max_rank, a->rtol * Anorm, a->rtol, Anorm, boundtocpu ? "CPU" : "GPU", a->bs));
922       if (a->sf && !a->nativemult) a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(), a->hmatrix->u_basis_tree.index_map.data());
923       a->sampler->SetStream(handle->getMainStream());
924       if (boundtocpu) {
925         a->sampler->SetGPUSampling(false);
926         hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
927   #if defined(PETSC_H2OPUS_USE_GPU)
928       } else {
929         a->sampler->SetGPUSampling(true);
930         hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
931   #endif
932       }
933       samplingdone = PETSC_TRUE;
934     }
935   }
936   #if defined(PETSC_H2OPUS_USE_GPU)
937   if (!boundtocpu) {
938     delete a->hmatrix;
939     delete a->dist_hmatrix;
940     a->hmatrix      = NULL;
941     a->dist_hmatrix = NULL;
942   }
943   A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
944   #endif
945   PetscCall(PetscLogEventEnd(MAT_H2Opus_Build, A, 0, 0, 0));
946 
947   if (!a->s) a->s = 1.0;
948   A->assembled = PETSC_TRUE;
949 
950   if (samplingdone) {
951     PetscBool check  = a->check_construction;
952     PetscBool checke = PETSC_FALSE;
953 
954     PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check", &check, NULL));
955     PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check_explicit", &checke, NULL));
956     if (check) {
957       Mat       E, Ae;
958       PetscReal n1, ni, n2;
959       PetscReal n1A, niA, n2A;
960       void (*normfunc)(void);
961 
962       Ae = a->sampler->GetSamplingMat();
963       PetscCall(MatConvert(A, MATSHELL, MAT_INITIAL_MATRIX, &E));
964       PetscCall(MatShellSetOperation(E, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS));
965       PetscCall(MatAXPY(E, -1.0, Ae, DIFFERENT_NONZERO_PATTERN));
966       PetscCall(MatNorm(E, NORM_1, &n1));
967       PetscCall(MatNorm(E, NORM_INFINITY, &ni));
968       PetscCall(MatNorm(E, NORM_2, &n2));
969       if (checke) {
970         Mat eA, eE, eAe;
971 
972         PetscCall(MatComputeOperator(A, MATAIJ, &eA));
973         PetscCall(MatComputeOperator(E, MATAIJ, &eE));
974         PetscCall(MatComputeOperator(Ae, MATAIJ, &eAe));
975         PetscCall(MatFilter(eA, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
976         PetscCall(MatFilter(eE, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
977         PetscCall(MatFilter(eAe, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
978         PetscCall(PetscObjectSetName((PetscObject)eA, "H2Mat"));
979         PetscCall(MatView(eA, NULL));
980         PetscCall(PetscObjectSetName((PetscObject)eAe, "S"));
981         PetscCall(MatView(eAe, NULL));
982         PetscCall(PetscObjectSetName((PetscObject)eE, "H2Mat - S"));
983         PetscCall(MatView(eE, NULL));
984         PetscCall(MatDestroy(&eA));
985         PetscCall(MatDestroy(&eE));
986         PetscCall(MatDestroy(&eAe));
987       }
988 
989       PetscCall(MatGetOperation(Ae, MATOP_NORM, &normfunc));
990       PetscCall(MatSetOperation(Ae, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS));
991       PetscCall(MatNorm(Ae, NORM_1, &n1A));
992       PetscCall(MatNorm(Ae, NORM_INFINITY, &niA));
993       PetscCall(MatNorm(Ae, NORM_2, &n2A));
994       n1A = PetscMax(n1A, PETSC_SMALL);
995       n2A = PetscMax(n2A, PETSC_SMALL);
996       niA = PetscMax(niA, PETSC_SMALL);
997       PetscCall(MatSetOperation(Ae, MATOP_NORM, normfunc));
998       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n", (double)n1, (double)ni, (double)n2, (double)(n1 / n1A), (double)(ni / niA), (double)(n2 / n2A)));
999       PetscCall(MatDestroy(&E));
1000     }
1001     a->sampler->SetSamplingMat(NULL);
1002   }
1003   PetscFunctionReturn(PETSC_SUCCESS);
1004 }
1005 
1006 static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
1007 {
1008   PetscMPIInt size;
1009   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1010 
1011   PetscFunctionBegin;
1012   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1013   PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet supported");
1014   a->hmatrix->clearData();
1015   #if defined(PETSC_H2OPUS_USE_GPU)
1016   if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1017   #endif
1018   PetscFunctionReturn(PETSC_SUCCESS);
1019 }
1020 
1021 static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
1022 {
1023   Mat         A;
1024   Mat_H2OPUS *a, *b = (Mat_H2OPUS *)B->data;
1025   #if defined(PETSC_H2OPUS_USE_GPU)
1026   PetscBool iscpu = PETSC_FALSE;
1027   #else
1028   PetscBool iscpu = PETSC_TRUE;
1029   #endif
1030   MPI_Comm comm;
1031 
1032   PetscFunctionBegin;
1033   PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
1034   PetscCall(MatCreate(comm, &A));
1035   PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
1036   PetscCall(MatSetType(A, MATH2OPUS));
1037   PetscCall(MatPropagateSymmetryOptions(B, A));
1038   a = (Mat_H2OPUS *)A->data;
1039 
1040   a->eta              = b->eta;
1041   a->leafsize         = b->leafsize;
1042   a->basisord         = b->basisord;
1043   a->max_rank         = b->max_rank;
1044   a->bs               = b->bs;
1045   a->rtol             = b->rtol;
1046   a->norm_max_samples = b->norm_max_samples;
1047   if (op == MAT_COPY_VALUES) a->s = b->s;
1048 
1049   a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
1050   if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);
1051 
1052   #if defined(H2OPUS_USE_MPI)
1053   if (b->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix);
1054     #if defined(PETSC_H2OPUS_USE_GPU)
1055   if (b->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu);
1056     #endif
1057   #endif
1058   if (b->hmatrix) {
1059     a->hmatrix = new HMatrix(*b->hmatrix);
1060     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
1061   }
1062   #if defined(PETSC_H2OPUS_USE_GPU)
1063   if (b->hmatrix_gpu) {
1064     a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
1065     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
1066   }
1067   #endif
1068   if (b->sf) {
1069     PetscCall(PetscObjectReference((PetscObject)b->sf));
1070     a->sf = b->sf;
1071   }
1072   if (b->h2opus_indexmap) {
1073     PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap));
1074     a->h2opus_indexmap = b->h2opus_indexmap;
1075   }
1076 
1077   PetscCall(MatSetUp(A));
1078   PetscCall(MatSetUpMultiply_H2OPUS(A));
1079   if (op == MAT_COPY_VALUES) {
1080     A->assembled  = PETSC_TRUE;
1081     a->orthogonal = b->orthogonal;
1082   #if defined(PETSC_H2OPUS_USE_GPU)
1083     A->offloadmask = B->offloadmask;
1084   #endif
1085   }
1086   #if defined(PETSC_H2OPUS_USE_GPU)
1087   iscpu = B->boundtocpu;
1088   #endif
1089   PetscCall(MatBindToCPU(A, iscpu));
1090 
1091   *nA = A;
1092   PetscFunctionReturn(PETSC_SUCCESS);
1093 }
1094 
1095 static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
1096 {
1097   Mat_H2OPUS       *h2opus = (Mat_H2OPUS *)A->data;
1098   PetscBool         isascii, vieweps;
1099   PetscMPIInt       size;
1100   PetscViewerFormat format;
1101 
1102   PetscFunctionBegin;
1103   PetscCall(PetscObjectTypeCompare((PetscObject)view, PETSCVIEWERASCII, &isascii));
1104   PetscCall(PetscViewerGetFormat(view, &format));
1105   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1106   if (isascii) {
1107     if (format == PETSC_VIEWER_ASCII_MATLAB) {
1108       if (size == 1) {
1109         FILE *fp;
1110         PetscCall(PetscViewerASCIIGetPointer(view, &fp));
1111         dumpHMatrix(*h2opus->hmatrix, 6, fp);
1112       }
1113     } else {
1114       PetscCall(PetscViewerASCIIPrintf(view, "  H-Matrix constructed from %s\n", h2opus->kernel ? "Kernel" : "Mat"));
1115       PetscCall(PetscViewerASCIIPrintf(view, "  PointCloud dim %" PetscInt_FMT "\n", h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0));
1116       PetscCall(PetscViewerASCIIPrintf(view, "  Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n", h2opus->leafsize, (double)h2opus->eta));
1117       if (!h2opus->kernel) {
1118         PetscCall(PetscViewerASCIIPrintf(view, "  Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n", h2opus->max_rank, h2opus->bs, (double)h2opus->rtol));
1119       } else {
1120         PetscCall(PetscViewerASCIIPrintf(view, "  Offdiagonal blocks approximation order %" PetscInt_FMT "\n", h2opus->basisord));
1121       }
1122       PetscCall(PetscViewerASCIIPrintf(view, "  Number of samples for norms %" PetscInt_FMT "\n", h2opus->norm_max_samples));
1123       if (size == 1) {
1124         double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
1125         double low_rank_cpu  = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
1126   #if defined(PETSC_HAVE_CUDA)
1127         double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
1128         double low_rank_gpu  = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1129   #endif
1130         PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu));
1131   #if defined(PETSC_HAVE_CUDA)
1132         PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu));
1133   #endif
1134       } else {
1135   #if defined(PETSC_HAVE_CUDA)
1136         double      matrix_mem[4] = {0., 0., 0., 0.};
1137         PetscMPIInt rsize         = 4;
1138   #else
1139         double      matrix_mem[2] = {0., 0.};
1140         PetscMPIInt rsize         = 2;
1141   #endif
1142   #if defined(H2OPUS_USE_MPI)
1143         matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
1144         matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
1145     #if defined(PETSC_HAVE_CUDA)
1146         matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
1147         matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
1148     #endif
1149   #endif
1150         PetscCall(MPIU_Allreduce(MPI_IN_PLACE, matrix_mem, rsize, MPI_DOUBLE_PRECISION, MPI_SUM, PetscObjectComm((PetscObject)A)));
1151         PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]));
1152   #if defined(PETSC_HAVE_CUDA)
1153         PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]));
1154   #endif
1155       }
1156     }
1157   }
1158   vieweps = PETSC_FALSE;
1159   PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps", &vieweps, NULL));
1160   if (vieweps) {
1161     char        filename[256];
1162     const char *name;
1163 
1164     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1165     PetscCall(PetscSNPrintf(filename, sizeof(filename), "%s_structure.eps", name));
1166     PetscCall(PetscOptionsGetString(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps_filename", filename, sizeof(filename), NULL));
1167     outputEps(*h2opus->hmatrix, filename);
1168   }
1169   PetscFunctionReturn(PETSC_SUCCESS);
1170 }
1171 
1172 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx)
1173 {
1174   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
1175   PetscReal  *gcoords;
1176   PetscInt    N;
1177   MPI_Comm    comm;
1178   PetscMPIInt size;
1179   PetscBool   cong;
1180 
1181   PetscFunctionBegin;
1182   PetscCall(PetscLayoutSetUp(A->rmap));
1183   PetscCall(PetscLayoutSetUp(A->cmap));
1184   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1185   PetscCall(MatHasCongruentLayouts(A, &cong));
1186   PetscCheck(cong, comm, PETSC_ERR_SUP, "Only for square matrices with congruent layouts");
1187   N = A->rmap->N;
1188   PetscCallMPI(MPI_Comm_size(comm, &size));
1189   if (spacedim > 0 && size > 1 && cdist) {
1190     PetscSF      sf;
1191     MPI_Datatype dtype;
1192 
1193     PetscCallMPI(MPI_Type_contiguous(spacedim, MPIU_REAL, &dtype));
1194     PetscCallMPI(MPI_Type_commit(&dtype));
1195 
1196     PetscCall(PetscSFCreate(comm, &sf));
1197     PetscCall(PetscSFSetGraphWithPattern(sf, A->rmap, PETSCSF_PATTERN_ALLGATHER));
1198     PetscCall(PetscMalloc1(spacedim * N, &gcoords));
1199     PetscCall(PetscSFBcastBegin(sf, dtype, coords, gcoords, MPI_REPLACE));
1200     PetscCall(PetscSFBcastEnd(sf, dtype, coords, gcoords, MPI_REPLACE));
1201     PetscCall(PetscSFDestroy(&sf));
1202     PetscCallMPI(MPI_Type_free(&dtype));
1203   } else gcoords = (PetscReal *)coords;
1204 
1205   delete h2opus->ptcloud;
1206   delete h2opus->kernel;
1207   h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim, N, gcoords);
1208   if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel, spacedim, kernelctx);
1209   if (gcoords != coords) PetscCall(PetscFree(gcoords));
1210   A->preallocated = PETSC_TRUE;
1211   PetscFunctionReturn(PETSC_SUCCESS);
1212 }
1213 
1214   #if defined(PETSC_H2OPUS_USE_GPU)
1215 static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
1216 {
1217   PetscMPIInt size;
1218   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1219 
1220   PetscFunctionBegin;
1221   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1222   if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
1223     if (size > 1) {
1224       PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1225     #if defined(H2OPUS_USE_MPI)
1226       if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
1227       else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
1228     #endif
1229     } else {
1230       PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1231       if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
1232       else *a->hmatrix = *a->hmatrix_gpu;
1233     }
1234     delete a->hmatrix_gpu;
1235     delete a->dist_hmatrix_gpu;
1236     a->hmatrix_gpu      = NULL;
1237     a->dist_hmatrix_gpu = NULL;
1238     A->offloadmask      = PETSC_OFFLOAD_CPU;
1239   } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
1240     if (size > 1) {
1241       PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1242     #if defined(H2OPUS_USE_MPI)
1243       if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
1244       else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
1245     #endif
1246     } else {
1247       PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1248       if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
1249       else *a->hmatrix_gpu = *a->hmatrix;
1250     }
1251     delete a->hmatrix;
1252     delete a->dist_hmatrix;
1253     a->hmatrix      = NULL;
1254     a->dist_hmatrix = NULL;
1255     A->offloadmask  = PETSC_OFFLOAD_GPU;
1256   }
1257   PetscCall(PetscFree(A->defaultvectype));
1258   if (!flg) {
1259     PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
1260   } else {
1261     PetscCall(PetscStrallocpy(VECSTANDARD, &A->defaultvectype));
1262   }
1263   A->boundtocpu = flg;
1264   PetscFunctionReturn(PETSC_SUCCESS);
1265 }
1266   #endif
1267 
1268 /*MC
1269      MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package.
1270 
1271    Options Database Key:
1272 .     -mat_type h2opus - matrix type to "h2opus"
1273 
1274    Level: beginner
1275 
1276    Notes:
1277      H2Opus implements hierarchical matrices in the H^2 flavour. It supports CPU or NVIDIA GPUs.
1278 
1279      For CPU only builds, use `./configure --download-h2opus --download-thrust` to install PETSc to use H2Opus.
1280      In order to run on NVIDIA GPUs, use `./configure --download-h2opus --download-magma --download-kblas`.
1281 
1282    Reference:
1283 .  * -  "H2Opus: A distributed-memory multi-GPU software package for non-local operators", https://arxiv.org/abs/2109.05451
1284 
1285 .seealso: [](ch_matrices), `Mat`, `MATH2OPUS`, `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()`
1286 M*/
1287 PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
1288 {
1289   Mat_H2OPUS *a;
1290   PetscMPIInt size;
1291 
1292   PetscFunctionBegin;
1293   #if defined(PETSC_H2OPUS_USE_GPU)
1294   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
1295   #endif
1296   PetscCall(PetscNew(&a));
1297   A->data = (void *)a;
1298 
1299   a->eta              = 0.9;
1300   a->leafsize         = 32;
1301   a->basisord         = 4;
1302   a->max_rank         = 64;
1303   a->bs               = 32;
1304   a->rtol             = 1.e-4;
1305   a->s                = 1.0;
1306   a->norm_max_samples = 10;
1307   a->resize           = PETSC_TRUE; /* reallocate after compression */
1308   #if defined(H2OPUS_USE_MPI)
1309   h2opusCreateDistributedHandleComm(&a->handle, PetscObjectComm((PetscObject)A));
1310   #else
1311   h2opusCreateHandle(&a->handle);
1312   #endif
1313   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1314   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATH2OPUS));
1315   PetscCall(PetscMemzero(A->ops, sizeof(struct _MatOps)));
1316 
1317   A->ops->destroy          = MatDestroy_H2OPUS;
1318   A->ops->view             = MatView_H2OPUS;
1319   A->ops->assemblyend      = MatAssemblyEnd_H2OPUS;
1320   A->ops->mult             = MatMult_H2OPUS;
1321   A->ops->multtranspose    = MatMultTranspose_H2OPUS;
1322   A->ops->multadd          = MatMultAdd_H2OPUS;
1323   A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
1324   A->ops->scale            = MatScale_H2OPUS;
1325   A->ops->duplicate        = MatDuplicate_H2OPUS;
1326   A->ops->setfromoptions   = MatSetFromOptions_H2OPUS;
1327   A->ops->norm             = MatNorm_H2OPUS;
1328   A->ops->zeroentries      = MatZeroEntries_H2OPUS;
1329   #if defined(PETSC_H2OPUS_USE_GPU)
1330   A->ops->bindtocpu = MatBindToCPU_H2OPUS;
1331   #endif
1332 
1333   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", MatProductSetFromOptions_H2OPUS));
1334   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", MatProductSetFromOptions_H2OPUS));
1335   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", MatProductSetFromOptions_H2OPUS));
1336   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", MatProductSetFromOptions_H2OPUS));
1337   #if defined(PETSC_H2OPUS_USE_GPU)
1338   PetscCall(PetscFree(A->defaultvectype));
1339   PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
1340   #endif
1341   PetscFunctionReturn(PETSC_SUCCESS);
1342 }
1343 
1344 /*@C
1345   MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.
1346 
1347   Input Parameter:
1348 . A - the matrix
1349 
1350   Level: intermediate
1351 
1352 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`
1353 @*/
1354 PetscErrorCode MatH2OpusOrthogonalize(Mat A)
1355 {
1356   PetscBool   ish2opus;
1357   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1358   PetscMPIInt size;
1359   PetscBool   boundtocpu = PETSC_TRUE;
1360 
1361   PetscFunctionBegin;
1362   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1363   PetscValidType(A, 1);
1364   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1365   if (!ish2opus) PetscFunctionReturn(PETSC_SUCCESS);
1366   if (a->orthogonal) PetscFunctionReturn(PETSC_SUCCESS);
1367   HLibProfile::clear();
1368   PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog, A, 0, 0, 0));
1369   #if defined(PETSC_H2OPUS_USE_GPU)
1370   boundtocpu = A->boundtocpu;
1371   #endif
1372   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1373   if (size > 1) {
1374     if (boundtocpu) {
1375       PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1376   #if defined(H2OPUS_USE_MPI)
1377       distributed_horthog(*a->dist_hmatrix, a->handle);
1378   #endif
1379   #if defined(PETSC_H2OPUS_USE_GPU)
1380       A->offloadmask = PETSC_OFFLOAD_CPU;
1381     } else {
1382       PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1383       PetscCall(PetscLogGpuTimeBegin());
1384     #if defined(H2OPUS_USE_MPI)
1385       distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
1386     #endif
1387       PetscCall(PetscLogGpuTimeEnd());
1388   #endif
1389     }
1390   } else {
1391   #if defined(H2OPUS_USE_MPI)
1392     h2opusHandle_t handle = a->handle->handle;
1393   #else
1394     h2opusHandle_t handle = a->handle;
1395   #endif
1396     if (boundtocpu) {
1397       PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1398       horthog(*a->hmatrix, handle);
1399   #if defined(PETSC_H2OPUS_USE_GPU)
1400       A->offloadmask = PETSC_OFFLOAD_CPU;
1401     } else {
1402       PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1403       PetscCall(PetscLogGpuTimeBegin());
1404       horthog(*a->hmatrix_gpu, handle);
1405       PetscCall(PetscLogGpuTimeEnd());
1406   #endif
1407     }
1408   }
1409   a->orthogonal = PETSC_TRUE;
1410   { /* log flops */
1411     double gops, time, perf, dev;
1412     HLibProfile::getHorthogPerf(gops, time, perf, dev);
1413   #if defined(PETSC_H2OPUS_USE_GPU)
1414     if (boundtocpu) {
1415       PetscCall(PetscLogFlops(1e9 * gops));
1416     } else {
1417       PetscCall(PetscLogGpuFlops(1e9 * gops));
1418     }
1419   #else
1420     PetscCall(PetscLogFlops(1e9 * gops));
1421   #endif
1422   }
1423   PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog, A, 0, 0, 0));
1424   PetscFunctionReturn(PETSC_SUCCESS);
1425 }
1426 
1427 /*@C
1428   MatH2OpusCompress - Compress a hierarchical matrix.
1429 
1430   Input Parameters:
1431 + A   - the matrix
1432 - tol - the absolute truncation threshold
1433 
1434   Level: intermediate
1435 
1436 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()`
1437 @*/
1438 PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
1439 {
1440   PetscBool   ish2opus;
1441   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1442   PetscMPIInt size;
1443   PetscBool   boundtocpu = PETSC_TRUE;
1444 
1445   PetscFunctionBegin;
1446   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1447   PetscValidType(A, 1);
1448   PetscValidLogicalCollectiveReal(A, tol, 2);
1449   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1450   if (!ish2opus || tol <= 0.0) PetscFunctionReturn(PETSC_SUCCESS);
1451   PetscCall(MatH2OpusOrthogonalize(A));
1452   HLibProfile::clear();
1453   PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress, A, 0, 0, 0));
1454   #if defined(PETSC_H2OPUS_USE_GPU)
1455   boundtocpu = A->boundtocpu;
1456   #endif
1457   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1458   if (size > 1) {
1459     if (boundtocpu) {
1460       PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1461   #if defined(H2OPUS_USE_MPI)
1462       distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
1463       if (a->resize) {
1464         DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix);
1465         delete a->dist_hmatrix;
1466         a->dist_hmatrix = dist_hmatrix;
1467       }
1468   #endif
1469   #if defined(PETSC_H2OPUS_USE_GPU)
1470       A->offloadmask = PETSC_OFFLOAD_CPU;
1471     } else {
1472       PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1473       PetscCall(PetscLogGpuTimeBegin());
1474     #if defined(H2OPUS_USE_MPI)
1475       distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);
1476 
1477       if (a->resize) {
1478         DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu);
1479         delete a->dist_hmatrix_gpu;
1480         a->dist_hmatrix_gpu = dist_hmatrix_gpu;
1481       }
1482     #endif
1483       PetscCall(PetscLogGpuTimeEnd());
1484   #endif
1485     }
1486   } else {
1487   #if defined(H2OPUS_USE_MPI)
1488     h2opusHandle_t handle = a->handle->handle;
1489   #else
1490     h2opusHandle_t handle = a->handle;
1491   #endif
1492     if (boundtocpu) {
1493       PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1494       hcompress(*a->hmatrix, tol, handle);
1495 
1496       if (a->resize) {
1497         HMatrix *hmatrix = new HMatrix(*a->hmatrix);
1498         delete a->hmatrix;
1499         a->hmatrix = hmatrix;
1500       }
1501   #if defined(PETSC_H2OPUS_USE_GPU)
1502       A->offloadmask = PETSC_OFFLOAD_CPU;
1503     } else {
1504       PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1505       PetscCall(PetscLogGpuTimeBegin());
1506       hcompress(*a->hmatrix_gpu, tol, handle);
1507       PetscCall(PetscLogGpuTimeEnd());
1508 
1509       if (a->resize) {
1510         HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu);
1511         delete a->hmatrix_gpu;
1512         a->hmatrix_gpu = hmatrix_gpu;
1513       }
1514   #endif
1515     }
1516   }
1517   { /* log flops */
1518     double gops, time, perf, dev;
1519     HLibProfile::getHcompressPerf(gops, time, perf, dev);
1520   #if defined(PETSC_H2OPUS_USE_GPU)
1521     if (boundtocpu) {
1522       PetscCall(PetscLogFlops(1e9 * gops));
1523     } else {
1524       PetscCall(PetscLogGpuFlops(1e9 * gops));
1525     }
1526   #else
1527     PetscCall(PetscLogFlops(1e9 * gops));
1528   #endif
1529   }
1530   PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress, A, 0, 0, 0));
1531   PetscFunctionReturn(PETSC_SUCCESS);
1532 }
1533 
1534 /*@C
1535   MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix-vector products on another matrix to construct a hierarchical matrix.
1536 
1537   Input Parameters:
1538 + A   - the hierarchical matrix
1539 . B   - the matrix to be sampled
1540 . bs  - maximum number of samples to be taken concurrently
1541 - tol - relative tolerance for construction
1542 
1543   Notes:
1544   You need to call `MatAssemblyBegin()` and `MatAssemblyEnd()` to update the hierarchical matrix.
1545 
1546   Level: intermediate
1547 
1548 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`
1549 @*/
1550 PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1551 {
1552   PetscBool ish2opus;
1553 
1554   PetscFunctionBegin;
1555   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1556   PetscValidType(A, 1);
1557   if (B) PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
1558   PetscValidLogicalCollectiveInt(A, bs, 3);
1559   PetscValidLogicalCollectiveReal(A, tol, 3);
1560   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1561   if (ish2opus) {
1562     Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1563 
1564     if (!a->sampler) a->sampler = new PetscMatrixSampler();
1565     a->sampler->SetSamplingMat(B);
1566     if (bs > 0) a->bs = bs;
1567     if (tol > 0.) a->rtol = tol;
1568     delete a->kernel;
1569   }
1570   PetscFunctionReturn(PETSC_SUCCESS);
1571 }
1572 
1573 /*@C
1574   MatCreateH2OpusFromKernel - Creates a `MATH2OPUS` from a user-supplied kernel.
1575 
1576   Input Parameters:
1577 + comm      - MPI communicator
1578 . m         - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
1579 . n         - number of local columns (or `PETSC_DECIDE` to have calculated if `N` is given)
1580 . M         - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
1581 . N         - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
1582 . spacedim  - dimension of the space coordinates
1583 . coords    - coordinates of the points
1584 . cdist     - whether or not coordinates are distributed
1585 . kernel    - computational kernel (or `NULL`)
1586 . kernelctx - kernel context
1587 . eta       - admissibility condition tolerance
1588 . leafsize  - leaf size in cluster tree
1589 - basisord  - approximation order for Chebychev interpolation of low-rank blocks
1590 
1591   Output Parameter:
1592 . nA - matrix
1593 
1594   Options Database Keys:
1595 + -mat_h2opus_leafsize <`PetscInt`>    - Leaf size of cluster tree
1596 . -mat_h2opus_eta <`PetscReal`>        - Admissibility condition tolerance
1597 . -mat_h2opus_order <`PetscInt`>       - Chebychev approximation order
1598 - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms
1599 
1600   Level: intermediate
1601 
1602 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`
1603 @*/
1604 PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat *nA)
1605 {
1606   Mat         A;
1607   Mat_H2OPUS *h2opus;
1608   #if defined(PETSC_H2OPUS_USE_GPU)
1609   PetscBool iscpu = PETSC_FALSE;
1610   #else
1611   PetscBool iscpu = PETSC_TRUE;
1612   #endif
1613 
1614   PetscFunctionBegin;
1615   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1616   PetscCall(MatCreate(comm, &A));
1617   PetscCall(MatSetSizes(A, m, n, M, N));
1618   PetscCheck(M == N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1619   PetscCall(MatSetType(A, MATH2OPUS));
1620   PetscCall(MatBindToCPU(A, iscpu));
1621   PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, kernel, kernelctx));
1622 
1623   h2opus = (Mat_H2OPUS *)A->data;
1624   if (eta > 0.) h2opus->eta = eta;
1625   if (leafsize > 0) h2opus->leafsize = leafsize;
1626   if (basisord > 0) h2opus->basisord = basisord;
1627 
1628   *nA = A;
1629   PetscFunctionReturn(PETSC_SUCCESS);
1630 }
1631 
1632 /*@C
1633   MatCreateH2OpusFromMat - Creates a `MATH2OPUS` sampling from a user-supplied operator.
1634 
1635   Input Parameters:
1636 + B        - the matrix to be sampled
1637 . spacedim - dimension of the space coordinates
1638 . coords   - coordinates of the points
1639 . cdist    - whether or not coordinates are distributed
1640 . eta      - admissibility condition tolerance
1641 . leafsize - leaf size in cluster tree
1642 . maxrank  - maximum rank allowed
1643 . bs       - maximum number of samples to be taken concurrently
1644 - rtol     - relative tolerance for construction
1645 
1646   Output Parameter:
1647 . nA - matrix
1648 
1649   Options Database Keys:
1650 + -mat_h2opus_leafsize <`PetscInt`>      - Leaf size of cluster tree
1651 . -mat_h2opus_eta <`PetscReal`>          - Admissibility condition tolerance
1652 . -mat_h2opus_maxrank <`PetscInt`>       - Maximum rank when constructed from matvecs
1653 . -mat_h2opus_samples <`PetscInt`>       - Maximum number of samples to be taken concurrently when constructing from matvecs
1654 . -mat_h2opus_rtol <`PetscReal`>         - Relative tolerance for construction from sampling
1655 . -mat_h2opus_check <`PetscBool`>        - Check error when constructing from sampling during MatAssemblyEnd()
1656 . -mat_h2opus_hara_verbose <`PetscBool`> - Verbose output from hara construction
1657 - -mat_h2opus_normsamples <`PetscInt`>   - Maximum number of samples to be when estimating norms
1658 
1659   Level: intermediate
1660 
1661   Note:
1662   Not available in parallel
1663 
1664 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()`
1665 @*/
1666 PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1667 {
1668   Mat         A;
1669   Mat_H2OPUS *h2opus;
1670   MPI_Comm    comm;
1671   PetscBool   boundtocpu = PETSC_TRUE;
1672 
1673   PetscFunctionBegin;
1674   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
1675   PetscValidLogicalCollectiveInt(B, spacedim, 2);
1676   PetscValidLogicalCollectiveBool(B, cdist, 4);
1677   PetscValidLogicalCollectiveReal(B, eta, 5);
1678   PetscValidLogicalCollectiveInt(B, leafsize, 6);
1679   PetscValidLogicalCollectiveInt(B, maxrank, 7);
1680   PetscValidLogicalCollectiveInt(B, bs, 8);
1681   PetscValidLogicalCollectiveReal(B, rtol, 9);
1682   PetscAssertPointer(nA, 10);
1683   PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
1684   PetscCheck(B->rmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1685   PetscCheck(B->rmap->N == B->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1686   PetscCall(MatCreate(comm, &A));
1687   PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
1688   #if defined(PETSC_H2OPUS_USE_GPU)
1689   {
1690     VecType   vtype;
1691     PetscBool isstd, iscuda, iskok;
1692 
1693     PetscCall(MatGetVecType(B, &vtype));
1694     PetscCall(PetscStrcmpAny(vtype, &isstd, VECSTANDARD, VECSEQ, VECMPI, ""));
1695     PetscCall(PetscStrcmpAny(vtype, &iscuda, VECCUDA, VECSEQCUDA, VECMPICUDA, ""));
1696     PetscCall(PetscStrcmpAny(vtype, &iskok, VECKOKKOS, VECSEQKOKKOS, VECMPIKOKKOS, ""));
1697     PetscCheck(isstd || iscuda || iskok, comm, PETSC_ERR_SUP, "Not for type %s", vtype);
1698     if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
1699     if (iskok && PetscDefined(HAVE_MACRO_KOKKOS_ENABLE_CUDA)) boundtocpu = PETSC_FALSE;
1700   }
1701   #endif
1702   PetscCall(MatSetType(A, MATH2OPUS));
1703   PetscCall(MatBindToCPU(A, boundtocpu));
1704   if (spacedim) PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, NULL, NULL));
1705   PetscCall(MatPropagateSymmetryOptions(B, A));
1706   /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */
1707 
1708   h2opus          = (Mat_H2OPUS *)A->data;
1709   h2opus->sampler = new PetscMatrixSampler(B);
1710   if (eta > 0.) h2opus->eta = eta;
1711   if (leafsize > 0) h2opus->leafsize = leafsize;
1712   if (maxrank > 0) h2opus->max_rank = maxrank;
1713   if (bs > 0) h2opus->bs = bs;
1714   if (rtol > 0.) h2opus->rtol = rtol;
1715   *nA             = A;
1716   A->preallocated = PETSC_TRUE;
1717   PetscFunctionReturn(PETSC_SUCCESS);
1718 }
1719 
1720 /*@C
1721   MatH2OpusGetIndexMap - Access reordering index set.
1722 
1723   Input Parameter:
1724 . A - the matrix
1725 
1726   Output Parameter:
1727 . indexmap - the index set for the reordering
1728 
1729   Level: intermediate
1730 
1731 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1732 @*/
1733 PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1734 {
1735   PetscBool   ish2opus;
1736   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1737 
1738   PetscFunctionBegin;
1739   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1740   PetscValidType(A, 1);
1741   PetscAssertPointer(indexmap, 2);
1742   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1743   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1744   PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1745   *indexmap = a->h2opus_indexmap;
1746   PetscFunctionReturn(PETSC_SUCCESS);
1747 }
1748 
1749 /*@C
1750   MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering
1751 
1752   Input Parameters:
1753 + A             - the matrix
1754 . nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1755 - in            - the vector to be mapped
1756 
1757   Output Parameter:
1758 . out - the newly created mapped vector
1759 
1760   Level: intermediate
1761 
1762 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1763 @*/
1764 PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec *out)
1765 {
1766   PetscBool    ish2opus;
1767   Mat_H2OPUS  *a = (Mat_H2OPUS *)A->data;
1768   PetscScalar *xin, *xout;
1769   PetscBool    nm;
1770 
1771   PetscFunctionBegin;
1772   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1773   PetscValidType(A, 1);
1774   PetscValidLogicalCollectiveBool(A, nativetopetsc, 2);
1775   PetscValidHeaderSpecific(in, VEC_CLASSID, 3);
1776   PetscAssertPointer(out, 4);
1777   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1778   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1779   PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1780   nm = a->nativemult;
1781   PetscCall(MatH2OpusSetNativeMult(A, (PetscBool)!nativetopetsc));
1782   PetscCall(MatCreateVecs(A, out, NULL));
1783   PetscCall(MatH2OpusSetNativeMult(A, nm));
1784   if (!a->sf) { /* same ordering */
1785     PetscCall(VecCopy(in, *out));
1786     PetscFunctionReturn(PETSC_SUCCESS);
1787   }
1788   PetscCall(VecGetArrayRead(in, (const PetscScalar **)&xin));
1789   PetscCall(VecGetArrayWrite(*out, &xout));
1790   if (nativetopetsc) {
1791     PetscCall(PetscSFReduceBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1792     PetscCall(PetscSFReduceEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1793   } else {
1794     PetscCall(PetscSFBcastBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1795     PetscCall(PetscSFBcastEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1796   }
1797   PetscCall(VecRestoreArrayRead(in, (const PetscScalar **)&xin));
1798   PetscCall(VecRestoreArrayWrite(*out, &xout));
1799   PetscFunctionReturn(PETSC_SUCCESS);
1800 }
1801 
1802 /*@C
1803   MatH2OpusLowRankUpdate - Perform a low-rank update of the form A = A + s * U * V^T
1804 
1805   Input Parameters:
1806 + A - the hierarchical `MATH2OPUS` matrix
1807 . s - the scaling factor
1808 . U - the dense low-rank update matrix
1809 - V - (optional) the dense low-rank update matrix (if `NULL`, then `V` = `U` is assumed)
1810 
1811   Note:
1812   The `U` and `V` matrices must be in dense format
1813 
1814   Level: intermediate
1815 
1816 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE`
1817 @*/
1818 PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s)
1819 {
1820   PetscBool flg;
1821 
1822   PetscFunctionBegin;
1823   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1824   PetscValidType(A, 1);
1825   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1826   PetscValidHeaderSpecific(U, MAT_CLASSID, 2);
1827   PetscCheckSameComm(A, 1, U, 2);
1828   if (V) {
1829     PetscValidHeaderSpecific(V, MAT_CLASSID, 3);
1830     PetscCheckSameComm(A, 1, V, 3);
1831   }
1832   PetscValidLogicalCollectiveScalar(A, s, 4);
1833 
1834   if (!V) V = U;
1835   PetscCheck(U->cmap->N == V->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT, U->cmap->N, V->cmap->N);
1836   if (!U->cmap->N) PetscFunctionReturn(PETSC_SUCCESS);
1837   PetscCall(PetscLayoutCompare(U->rmap, A->rmap, &flg));
1838   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A and U must have the same row layout");
1839   PetscCall(PetscLayoutCompare(V->rmap, A->cmap, &flg));
1840   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A column layout must match V row column layout");
1841   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &flg));
1842   if (flg) {
1843     Mat_H2OPUS        *a = (Mat_H2OPUS *)A->data;
1844     const PetscScalar *u, *v, *uu, *vv;
1845     PetscInt           ldu, ldv;
1846     PetscMPIInt        size;
1847   #if defined(H2OPUS_USE_MPI)
1848     h2opusHandle_t handle = a->handle->handle;
1849   #else
1850     h2opusHandle_t handle = a->handle;
1851   #endif
1852     PetscBool usesf = (PetscBool)(a->sf && !a->nativemult);
1853     PetscSF   usf, vsf;
1854 
1855     PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1856     PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet implemented in parallel");
1857     PetscCall(PetscLogEventBegin(MAT_H2Opus_LR, A, 0, 0, 0));
1858     PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U, &flg, MATSEQDENSE, MATMPIDENSE, ""));
1859     PetscCheck(flg, PetscObjectComm((PetscObject)U), PETSC_ERR_SUP, "Not for U of type %s", ((PetscObject)U)->type_name);
1860     PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V, &flg, MATSEQDENSE, MATMPIDENSE, ""));
1861     PetscCheck(flg, PetscObjectComm((PetscObject)V), PETSC_ERR_SUP, "Not for V of type %s", ((PetscObject)V)->type_name);
1862     PetscCall(MatDenseGetLDA(U, &ldu));
1863     PetscCall(MatDenseGetLDA(V, &ldv));
1864     PetscCall(MatBoundToCPU(A, &flg));
1865     if (usesf) {
1866       PetscInt n;
1867 
1868       PetscCall(MatDenseGetH2OpusVectorSF(U, a->sf, &usf));
1869       PetscCall(MatDenseGetH2OpusVectorSF(V, a->sf, &vsf));
1870       PetscCall(MatH2OpusResizeBuffers_Private(A, U->cmap->N, V->cmap->N));
1871       PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
1872       ldu = n;
1873       ldv = n;
1874     }
1875     if (flg) {
1876       PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1877       PetscCall(MatDenseGetArrayRead(U, &u));
1878       PetscCall(MatDenseGetArrayRead(V, &v));
1879       if (usesf) {
1880         vv = MatH2OpusGetThrustPointer(*a->yy);
1881         PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1882         PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1883         if (U != V) {
1884           uu = MatH2OpusGetThrustPointer(*a->xx);
1885           PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1886           PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1887         } else uu = vv;
1888       } else {
1889         uu = u;
1890         vv = v;
1891       }
1892       hlru_global(*a->hmatrix, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1893       PetscCall(MatDenseRestoreArrayRead(U, &u));
1894       PetscCall(MatDenseRestoreArrayRead(V, &v));
1895     } else {
1896   #if defined(PETSC_H2OPUS_USE_GPU)
1897       PetscBool flgU, flgV;
1898 
1899       PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1900       PetscCall(PetscObjectTypeCompareAny((PetscObject)U, &flgU, MATSEQDENSE, MATMPIDENSE, ""));
1901       if (flgU) PetscCall(MatConvert(U, MATDENSECUDA, MAT_INPLACE_MATRIX, &U));
1902       PetscCall(PetscObjectTypeCompareAny((PetscObject)V, &flgV, MATSEQDENSE, MATMPIDENSE, ""));
1903       if (flgV) PetscCall(MatConvert(V, MATDENSECUDA, MAT_INPLACE_MATRIX, &V));
1904       PetscCall(MatDenseCUDAGetArrayRead(U, &u));
1905       PetscCall(MatDenseCUDAGetArrayRead(V, &v));
1906       if (usesf) {
1907         vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
1908         PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1909         PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1910         if (U != V) {
1911           uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
1912           PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1913           PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1914         } else uu = vv;
1915       } else {
1916         uu = u;
1917         vv = v;
1918       }
1919   #else
1920       SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "This should not happen");
1921   #endif
1922       hlru_global(*a->hmatrix_gpu, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1923   #if defined(PETSC_H2OPUS_USE_GPU)
1924       PetscCall(MatDenseCUDARestoreArrayRead(U, &u));
1925       PetscCall(MatDenseCUDARestoreArrayRead(V, &v));
1926       if (flgU) PetscCall(MatConvert(U, MATDENSE, MAT_INPLACE_MATRIX, &U));
1927       if (flgV) PetscCall(MatConvert(V, MATDENSE, MAT_INPLACE_MATRIX, &V));
1928   #endif
1929     }
1930     PetscCall(PetscLogEventEnd(MAT_H2Opus_LR, A, 0, 0, 0));
1931     a->orthogonal = PETSC_FALSE;
1932   }
1933   PetscFunctionReturn(PETSC_SUCCESS);
1934 }
1935 #endif
1936