xref: /petsc/src/mat/impls/h2opus/cuda/math2opus.cu (revision ae1ee55146a7ad071171b861759b85940c7e5c67)
1 #include <h2opusconf.h>
2 /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
3 #if !defined(__CUDACC__) || defined(H2OPUS_USE_GPU)
4   #include <h2opus.h>
5   #if defined(H2OPUS_USE_MPI)
6     #include <h2opus/distributed/distributed_h2opus_handle.h>
7     #include <h2opus/distributed/distributed_geometric_construction.h>
8     #include <h2opus/distributed/distributed_hgemv.h>
9     #include <h2opus/distributed/distributed_horthog.h>
10     #include <h2opus/distributed/distributed_hcompress.h>
11   #endif
12   #include <h2opus/util/boxentrygen.h>
13   #include <petsc/private/matimpl.h>
14   #include <petsc/private/vecimpl.h>
15   #include <petsc/private/deviceimpl.h> /*I "petscmat.h"   I*/
16   #include <petscsf.h>
17 
18 /* math2opusutils */
19 PETSC_INTERN PetscErrorCode MatDenseGetH2OpusStridedSF(Mat, PetscSF, PetscSF *);
20 PETSC_INTERN PetscErrorCode VecSign(Vec, Vec);
21 PETSC_INTERN PetscErrorCode VecSetDelta(Vec, PetscInt);
22 PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat, NormType, PetscInt, PetscReal *);
23 
24   #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())
25 
26   /* Use GPU only if H2OPUS is configured for GPU */
27   #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
28     #define PETSC_H2OPUS_USE_GPU
29   #endif
30   #if defined(PETSC_H2OPUS_USE_GPU)
31     #define MatH2OpusUpdateIfNeeded(A, B) MatBindToCPU(A, (PetscBool)((A)->boundtocpu || (B)))
32   #else
33     #define MatH2OpusUpdateIfNeeded(A, B) PETSC_SUCCESS
34   #endif
35 
36 // TODO H2OPUS:
37 // DistributedHMatrix
38 //   unsymmetric ?
39 //   transpose for distributed_hgemv?
40 //   clearData()
41 // Unify interface for sequential and parallel?
42 // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
43 //
44 template <class T>
45 class PetscPointCloud : public H2OpusDataSet<T> {
46 private:
47   int            dimension;
48   size_t         num_points;
49   std::vector<T> pts;
50 
51 public:
PetscPointCloud(int dim,size_t num_pts,const T coords[])52   PetscPointCloud(int dim, size_t num_pts, const T coords[])
53   {
54     dim              = dim > 0 ? dim : 1;
55     this->dimension  = dim;
56     this->num_points = num_pts;
57 
58     pts.resize(num_pts * dim);
59     if (coords) {
60       for (size_t n = 0; n < num_pts; n++)
61         for (int i = 0; i < dim; i++) pts[n * dim + i] = coords[n * dim + i];
62     } else {
63       PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0;
64       for (size_t n = 0; n < num_pts; n++) {
65         pts[n * dim] = n * h;
66         for (int i = 1; i < dim; i++) pts[n * dim + i] = 0.0;
67       }
68     }
69   }
70 
PetscPointCloud(const PetscPointCloud<T> & other)71   PetscPointCloud(const PetscPointCloud<T> &other)
72   {
73     size_t N         = other.dimension * other.num_points;
74     this->dimension  = other.dimension;
75     this->num_points = other.num_points;
76     this->pts.resize(N);
77     for (size_t i = 0; i < N; i++) this->pts[i] = other.pts[i];
78   }
79 
getDimension() const80   int getDimension() const { return dimension; }
81 
getDataSetSize() const82   size_t getDataSetSize() const { return num_points; }
83 
getDataPoint(size_t idx,int dim) const84   T getDataPoint(size_t idx, int dim) const
85   {
86     assert(dim < dimension && idx < num_points);
87     return pts[idx * dimension + dim];
88   }
89 
Print(std::ostream & out=std::cout)90   void Print(std::ostream &out = std::cout)
91   {
92     out << "Dimension: " << dimension << std::endl;
93     out << "NumPoints: " << num_points << std::endl;
94     for (size_t n = 0; n < num_points; n++) {
95       for (int d = 0; d < dimension; d++) out << pts[n * dimension + d] << " ";
96       out << std::endl;
97     }
98   }
99 };
100 
101 template <class T>
102 class PetscFunctionGenerator {
103 private:
104   MatH2OpusKernelFn *k;
105   int                dim;
106   void              *ctx;
107 
108 public:
PetscFunctionGenerator(MatH2OpusKernelFn * k,int dim,PetscCtx ctx)109   PetscFunctionGenerator(MatH2OpusKernelFn *k, int dim, PetscCtx ctx)
110   {
111     this->k   = k;
112     this->dim = dim;
113     this->ctx = ctx;
114   }
PetscFunctionGenerator(PetscFunctionGenerator & other)115   PetscFunctionGenerator(PetscFunctionGenerator &other)
116   {
117     this->k   = other.k;
118     this->dim = other.dim;
119     this->ctx = other.ctx;
120   }
operator ()(PetscReal * pt1,PetscReal * pt2)121   T operator()(PetscReal *pt1, PetscReal *pt2) { return (T)((*this->k)(this->dim, pt1, pt2, this->ctx)); }
122 };
123 
124   #include <../src/mat/impls/h2opus/math2opussampler.hpp>
125 
126   /* just to not clutter the code */
127   #if !defined(H2OPUS_USE_GPU)
128 typedef HMatrix HMatrix_GPU;
129     #if defined(H2OPUS_USE_MPI)
130 typedef DistributedHMatrix DistributedHMatrix_GPU;
131     #endif
132   #endif
133 
/* Private data of a MATH2OPUS matrix */
typedef struct {
  #if defined(H2OPUS_USE_MPI)
  distributedH2OpusHandle_t handle;
  #else
  h2opusHandle_t handle;
  #endif
  /* Sequential and parallel matrices are two different classes at the moment */
  HMatrix *hmatrix;
  #if defined(H2OPUS_USE_MPI)
  DistributedHMatrix *dist_hmatrix;
  #else
  HMatrix *dist_hmatrix; /* just to not clutter the code */
  #endif
  /* May use permutations */
  PetscSF                           sf;              /* maps PETSc ordering to the H2OPUS index map */
  PetscLayout                       h2opus_rmap, h2opus_cmap;
  IS                                h2opus_indexmap;
  thrust::host_vector<PetscScalar> *xx, *yy;         /* CPU work buffers in H2OPUS ordering */
  PetscInt                          xxs, yys;        /* number of columns the work buffers can hold */
  PetscBool                         multsetup;

  /* GPU */
  HMatrix_GPU *hmatrix_gpu;
  #if defined(H2OPUS_USE_MPI)
  DistributedHMatrix_GPU *dist_hmatrix_gpu;
  #else
  HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */
  #endif
  #if defined(PETSC_H2OPUS_USE_GPU)
  thrust::device_vector<PetscScalar> *xx_gpu, *yy_gpu;   /* GPU counterparts of xx/yy */
  PetscInt                            xxs_gpu, yys_gpu;  /* GPU counterparts of xxs/yys */
  #endif

  /* construction from matvecs */
  PetscMatrixSampler *sampler;
  PetscBool           nativemult; /* if true, MatMult operates directly in the H2OPUS ordering (no SF permutation) */

  /* Admissibility */
  PetscReal eta;
  PetscInt  leafsize;

  /* for dof reordering */
  PetscPointCloud<PetscReal> *ptcloud;

  /* kernel for generating matrix entries */
  PetscFunctionGenerator<PetscScalar> *kernel;

  /* basis orthogonalized? */
  PetscBool orthogonal;

  /* customization (see MatSetFromOptions_H2OPUS for the corresponding options) */
  PetscInt  basisord;
  PetscInt  max_rank;
  PetscInt  bs;
  PetscReal rtol;
  PetscInt  norm_max_samples;
  PetscBool check_construction;
  PetscBool hara_verbose;
  PetscBool resize;

  /* keeps track of MatScale values */
  PetscScalar s;
} Mat_H2OPUS;
197 
/* Releases everything owned by the MATH2OPUS context: the H2OPUS handle,
   CPU/GPU hierarchical matrices, permutation objects (SF, layouts, IS),
   work buffers, sampler, point cloud and kernel generator, then removes
   the composed product hooks and frees the context itself. */
static PetscErrorCode MatDestroy_H2OPUS(Mat A)
{
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

  PetscFunctionBegin;
  #if defined(H2OPUS_USE_MPI)
  h2opusDestroyDistributedHandle(a->handle);
  #else
  h2opusDestroyHandle(a->handle);
  #endif
  delete a->dist_hmatrix;
  delete a->hmatrix;
  PetscCall(PetscSFDestroy(&a->sf));
  PetscCall(PetscLayoutDestroy(&a->h2opus_rmap));
  PetscCall(PetscLayoutDestroy(&a->h2opus_cmap));
  PetscCall(ISDestroy(&a->h2opus_indexmap));
  delete a->xx;
  delete a->yy;
  delete a->hmatrix_gpu;
  delete a->dist_hmatrix_gpu;
  #if defined(PETSC_H2OPUS_USE_GPU)
  delete a->xx_gpu;
  delete a->yy_gpu;
  #endif
  delete a->sampler;
  delete a->ptcloud;
  delete a->kernel;
  /* unregister the MatProduct implementations composed on this object */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", NULL));
  PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
  PetscCall(PetscFree(A->data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
233 
MatH2OpusSetNativeMult(Mat A,PetscBool nm)234 PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
235 {
236   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
237   PetscBool   ish2opus;
238 
239   PetscFunctionBegin;
240   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
241   PetscValidLogicalCollectiveBool(A, nm, 2);
242   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
243   if (ish2opus) {
244     if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
245       if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
246         PetscLayout t;
247         t              = A->rmap;
248         A->rmap        = a->h2opus_rmap;
249         a->h2opus_rmap = t;
250         t              = A->cmap;
251         A->cmap        = a->h2opus_cmap;
252         a->h2opus_cmap = t;
253       }
254     }
255     a->nativemult = nm;
256   }
257   PetscFunctionReturn(PETSC_SUCCESS);
258 }
259 
/* Query whether A multiplies in the native H2OPUS ordering
   (see MatH2OpusSetNativeMult); errors if A is not MATH2OPUS. */
PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
{
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
  PetscBool   ish2opus;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(nm, 2);
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
  PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
  *nm = a->nativemult;
  PetscFunctionReturn(PETSC_SUCCESS);
}
273 
MatNorm_H2OPUS(Mat A,NormType normtype,PetscReal * n)274 PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal *n)
275 {
276   PetscBool   ish2opus;
277   PetscInt    nmax = PETSC_DECIDE;
278   Mat_H2OPUS *a    = NULL;
279   PetscBool   mult = PETSC_FALSE;
280 
281   PetscFunctionBegin;
282   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
283   if (ish2opus) { /* set userdefine number of samples and fastpath for mult (norms are order independent) */
284     a = (Mat_H2OPUS *)A->data;
285 
286     nmax = a->norm_max_samples;
287     mult = a->nativemult;
288     PetscCall(MatH2OpusSetNativeMult(A, PETSC_TRUE));
289   } else {
290     PetscCall(PetscOptionsGetInt(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_approximate_norm_samples", &nmax, NULL));
291   }
292   PetscCall(MatApproximateNorm_Private(A, normtype, nmax, n));
293   if (a) PetscCall(MatH2OpusSetNativeMult(A, mult));
294   PetscFunctionReturn(PETSC_SUCCESS);
295 }
296 
MatH2OpusResizeBuffers_Private(Mat A,PetscInt xN,PetscInt yN)297 static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
298 {
299   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
300   PetscInt    n;
301   PetscBool   boundtocpu = PETSC_TRUE;
302 
303   PetscFunctionBegin;
304   #if defined(PETSC_H2OPUS_USE_GPU)
305   boundtocpu = A->boundtocpu;
306   #endif
307   PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
308   if (boundtocpu) {
309     if (h2opus->xxs < xN) {
310       h2opus->xx->resize(n * xN);
311       h2opus->xxs = xN;
312     }
313     if (h2opus->yys < yN) {
314       h2opus->yy->resize(n * yN);
315       h2opus->yys = yN;
316     }
317   }
318   #if defined(PETSC_H2OPUS_USE_GPU)
319   if (!boundtocpu) {
320     if (h2opus->xxs_gpu < xN) {
321       h2opus->xx_gpu->resize(n * xN);
322       h2opus->xxs_gpu = xN;
323     }
324     if (h2opus->yys_gpu < yN) {
325       h2opus->yy_gpu->resize(n * yN);
326       h2opus->yys_gpu = yN;
327     }
328   }
329   #endif
330   PetscFunctionReturn(PETSC_SUCCESS);
331 }
332 
/* C = s * op(A) * B with A MATH2OPUS and B, C dense (op = A^T when transA;
   the transpose is only supported sequentially, or when A is symmetric).
   Dispatches to the CPU or GPU hierarchical matrix depending on
   A->boundtocpu. When the PETSc ordering differs from the H2OPUS one
   (usesf), B is scattered into work buffers through a strided star forest
   and the result is gathered back into C. */
static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
{
  Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
  #if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = h2opus->handle->handle;
  #else
  h2opusHandle_t handle = h2opus->handle;
  #endif
  PetscBool    boundtocpu = PETSC_TRUE;
  PetscScalar *xx, *yy, *uxx, *uyy; /* raw B/C arrays and their (possibly permuted) counterparts */
  PetscInt     blda, clda;
  PetscMPIInt  size;
  PetscSF      bsf, csf;
  PetscBool    usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

  PetscFunctionBegin;
  HLibProfile::clear(); /* reset profiler so the flop logging below covers only this product */
  #if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
  #endif
  PetscCall(MatDenseGetLDA(B, &blda));
  PetscCall(MatDenseGetLDA(C, &clda));
  if (usesf) { /* set up strided SFs for the multiple columns and size the work buffers */
    PetscInt n;

    PetscCall(MatDenseGetH2OpusStridedSF(B, h2opus->sf, &bsf));
    PetscCall(MatDenseGetH2OpusStridedSF(C, h2opus->sf, &csf));

    PetscCall(MatH2OpusResizeBuffers_Private(A, B->cmap->N, C->cmap->N));
    PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
    blda = n; /* work buffers are densely packed with the SF-local length as leading dimension */
    clda = n;
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (boundtocpu) { /* CPU path */
    PetscCall(MatDenseGetArrayRead(B, (const PetscScalar **)&xx));
    PetscCall(MatDenseGetArrayWrite(C, &yy));
    if (usesf) { /* permute B into the H2OPUS ordering */
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
      PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
    } else {
      uxx = xx;
      uyy = yy;
    }
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
      PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
  #if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
  #endif
    } else {
      PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
    }
    PetscCall(MatDenseRestoreArrayRead(B, (const PetscScalar **)&xx));
    if (usesf) { /* gather the result back into PETSc ordering */
      PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
    }
    PetscCall(MatDenseRestoreArrayWrite(C, &yy));
  #if defined(PETSC_H2OPUS_USE_GPU)
  } else { /* GPU path */
    PetscBool ciscuda, biscuda;

    /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
    PetscCall(PetscObjectTypeCompareAny((PetscObject)B, &biscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
    if (!biscuda) PetscCall(MatConvert(B, MATDENSECUDA, MAT_INPLACE_MATRIX, &B));
    PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &ciscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
    if (!ciscuda) {
      C->assembled = PETSC_TRUE; /* flagged so the in-place conversion of the output is allowed */
      PetscCall(MatConvert(C, MATDENSECUDA, MAT_INPLACE_MATRIX, &C));
    }
    PetscCall(MatDenseCUDAGetArrayRead(B, (const PetscScalar **)&xx));
    PetscCall(MatDenseCUDAGetArrayWrite(C, &yy));
    if (usesf) { /* permute B into the H2OPUS ordering (device buffers) */
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
      PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
    } else {
      uxx = xx;
      uyy = yy;
    }
    PetscCall(PetscLogGpuTimeBegin());
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
      PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
    #if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
    #endif
    } else {
      PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatDenseCUDARestoreArrayRead(B, (const PetscScalar **)&xx));
    if (usesf) { /* gather the result back into PETSc ordering */
      PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
    }
    PetscCall(MatDenseCUDARestoreArrayWrite(C, &yy));
    /* undo the on-the-fly conversions performed above */
    if (!biscuda) PetscCall(MatConvert(B, MATDENSE, MAT_INPLACE_MATRIX, &B));
    if (!ciscuda) PetscCall(MatConvert(C, MATDENSE, MAT_INPLACE_MATRIX, &C));
  #endif
  }
  { /* log flops */
    double gops, time, perf, dev;
    HLibProfile::getHgemvPerf(gops, time, perf, dev); /* gops is in units of 1e9 operations */
  #if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9 * gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9 * gops));
    }
  #else
    PetscCall(PetscLogFlops(1e9 * gops));
  #endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
455 
MatProductNumeric_H2OPUS(Mat C)456 static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
457 {
458   Mat_Product *product = C->product;
459 
460   PetscFunctionBegin;
461   MatCheckProduct(C, 1);
462   switch (product->type) {
463   case MATPRODUCT_AB:
464     PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_FALSE, product->B, C));
465     break;
466   case MATPRODUCT_AtB:
467     PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_TRUE, product->B, C));
468     break;
469   default:
470     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
471   }
472   PetscFunctionReturn(PETSC_SUCCESS);
473 }
474 
MatProductSymbolic_H2OPUS(Mat C)475 static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
476 {
477   Mat_Product *product = C->product;
478   PetscBool    cisdense;
479   Mat          A, B;
480 
481   PetscFunctionBegin;
482   MatCheckProduct(C, 1);
483   A = product->A;
484   B = product->B;
485   switch (product->type) {
486   case MATPRODUCT_AB:
487     PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
488     PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
489     PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
490     if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
491     PetscCall(MatSetUp(C));
492     break;
493   case MATPRODUCT_AtB:
494     PetscCall(MatSetSizes(C, A->cmap->n, B->cmap->n, A->cmap->N, B->cmap->N));
495     PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
496     PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
497     if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
498     PetscCall(MatSetUp(C));
499     break;
500   default:
501     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
502   }
503   C->ops->productsymbolic = NULL;
504   C->ops->productnumeric  = MatProductNumeric_H2OPUS;
505   PetscFunctionReturn(PETSC_SUCCESS);
506 }
507 
MatProductSetFromOptions_H2OPUS(Mat C)508 static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
509 {
510   PetscFunctionBegin;
511   MatCheckProduct(C, 1);
512   if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
513   PetscFunctionReturn(PETSC_SUCCESS);
514 }
515 
/* y = s * op(A) * x + sy * y with A MATH2OPUS (op = A^T when trans; the
   transpose is only supported sequentially, or when A is symmetric).
   Dispatches to the CPU or GPU hierarchical matrix depending on
   A->boundtocpu. When the PETSc ordering differs from the H2OPUS one
   (usesf), x (and y if sy != 0) are scattered into work buffers and the
   result is gathered back. */
static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
{
  Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
  #if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = h2opus->handle->handle;
  #else
  h2opusHandle_t handle = h2opus->handle;
  #endif
  PetscBool    boundtocpu = PETSC_TRUE;
  PetscInt     n;
  PetscScalar *xx, *yy, *uxx, *uyy; /* raw vector arrays and their (possibly permuted) counterparts */
  PetscMPIInt  size;
  PetscBool    usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

  PetscFunctionBegin;
  HLibProfile::clear(); /* reset profiler so the flop logging below covers only this matvec */
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  #if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
  #endif
  if (usesf) PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
  else n = A->rmap->n;
  if (boundtocpu) { /* CPU path */
    PetscCall(VecGetArrayRead(x, (const PetscScalar **)&xx));
    if (sy == 0.0) { /* y is write-only when not accumulating */
      PetscCall(VecGetArrayWrite(y, &yy));
    } else {
      PetscCall(VecGetArray(y, &yy));
    }
    if (usesf) { /* permute x (and y if accumulating) into the H2OPUS ordering */
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy);

      PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      if (sy != 0.0) {
        PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
      }
    } else {
      uxx = xx;
      uyy = yy;
    }
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
      PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
  #if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
  #endif
    } else {
      PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
    }
    PetscCall(VecRestoreArrayRead(x, (const PetscScalar **)&xx));
    if (usesf) { /* gather the result back into PETSc ordering */
      PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
    }
    if (sy == 0.0) {
      PetscCall(VecRestoreArrayWrite(y, &yy));
    } else {
      PetscCall(VecRestoreArray(y, &yy));
    }
  #if defined(PETSC_H2OPUS_USE_GPU)
  } else { /* GPU path; mirrors the CPU path with device arrays/buffers */
    PetscCall(VecCUDAGetArrayRead(x, (const PetscScalar **)&xx));
    if (sy == 0.0) {
      PetscCall(VecCUDAGetArrayWrite(y, &yy));
    } else {
      PetscCall(VecCUDAGetArray(y, &yy));
    }
    if (usesf) {
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);

      PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      if (sy != 0.0) {
        PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
      }
    } else {
      uxx = xx;
      uyy = yy;
    }
    PetscCall(PetscLogGpuTimeBegin());
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
      PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
    #if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
    #endif
    } else {
      PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArrayRead(x, (const PetscScalar **)&xx));
    if (usesf) {
      PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
    }
    if (sy == 0.0) {
      PetscCall(VecCUDARestoreArrayWrite(y, &yy));
    } else {
      PetscCall(VecCUDARestoreArray(y, &yy));
    }
  #endif
  }
  { /* log flops */
    double gops, time, perf, dev;
    HLibProfile::getHgemvPerf(gops, time, perf, dev); /* gops is in units of 1e9 operations */
  #if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9 * gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9 * gops));
    }
  #else
    PetscCall(PetscLogFlops(1e9 * gops));
  #endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
640 
MatMultTranspose_H2OPUS(Mat A,Vec x,Vec y)641 static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
642 {
643   PetscBool xiscuda, yiscuda;
644 
645   PetscFunctionBegin;
646   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
647   PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
648   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
649   PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_TRUE));
650   PetscFunctionReturn(PETSC_SUCCESS);
651 }
652 
MatMult_H2OPUS(Mat A,Vec x,Vec y)653 static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
654 {
655   PetscBool xiscuda, yiscuda;
656 
657   PetscFunctionBegin;
658   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
659   PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
660   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
661   PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_FALSE));
662   PetscFunctionReturn(PETSC_SUCCESS);
663 }
664 
MatMultTransposeAdd_H2OPUS(Mat A,Vec x,Vec y,Vec z)665 static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
666 {
667   PetscBool xiscuda, ziscuda;
668 
669   PetscFunctionBegin;
670   PetscCall(VecCopy(y, z));
671   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
672   PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
673   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
674   PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_TRUE));
675   PetscFunctionReturn(PETSC_SUCCESS);
676 }
677 
MatMultAdd_H2OPUS(Mat A,Vec x,Vec y,Vec z)678 static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
679 {
680   PetscBool xiscuda, ziscuda;
681 
682   PetscFunctionBegin;
683   PetscCall(VecCopy(y, z));
684   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
685   PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
686   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
687   PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_FALSE));
688   PetscFunctionReturn(PETSC_SUCCESS);
689 }
690 
/* MatScale: fold the scaling into the scalar s applied at matvec time;
   the hierarchical matrix data itself is left untouched. */
static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
{
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

  PetscFunctionBegin;
  a->s *= s;
  PetscFunctionReturn(PETSC_SUCCESS);
}
699 
MatSetFromOptions_H2OPUS(Mat A,PetscOptionItems PetscOptionsObject)700 static PetscErrorCode MatSetFromOptions_H2OPUS(Mat A, PetscOptionItems PetscOptionsObject)
701 {
702   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
703 
704   PetscFunctionBegin;
705   PetscOptionsHeadBegin(PetscOptionsObject, "H2OPUS options");
706   PetscCall(PetscOptionsInt("-mat_h2opus_leafsize", "Leaf size of cluster tree", NULL, a->leafsize, &a->leafsize, NULL));
707   PetscCall(PetscOptionsReal("-mat_h2opus_eta", "Admissibility condition tolerance", NULL, a->eta, &a->eta, NULL));
708   PetscCall(PetscOptionsInt("-mat_h2opus_order", "Basis order for off-diagonal sampling when constructed from kernel", NULL, a->basisord, &a->basisord, NULL));
709   PetscCall(PetscOptionsInt("-mat_h2opus_maxrank", "Maximum rank when constructed from matvecs", NULL, a->max_rank, &a->max_rank, NULL));
710   PetscCall(PetscOptionsInt("-mat_h2opus_samples", "Maximum number of samples to be taken concurrently when constructing from matvecs", NULL, a->bs, &a->bs, NULL));
711   PetscCall(PetscOptionsInt("-mat_h2opus_normsamples", "Maximum number of samples to be when estimating norms", NULL, a->norm_max_samples, &a->norm_max_samples, NULL));
712   PetscCall(PetscOptionsReal("-mat_h2opus_rtol", "Relative tolerance for construction from sampling", NULL, a->rtol, &a->rtol, NULL));
713   PetscCall(PetscOptionsBool("-mat_h2opus_check", "Check error when constructing from sampling during MatAssemblyEnd()", NULL, a->check_construction, &a->check_construction, NULL));
714   PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose", "Verbose output from hara construction", NULL, a->hara_verbose, &a->hara_verbose, NULL));
715   PetscCall(PetscOptionsBool("-mat_h2opus_resize", "Resize after compression", NULL, a->resize, &a->resize, NULL));
716   PetscOptionsHeadEnd();
717   PetscFunctionReturn(PETSC_SUCCESS);
718 }
719 
720 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat, PetscInt, const PetscReal[], PetscBool, MatH2OpusKernelFn *, void *);
721 
MatH2OpusInferCoordinates_Private(Mat A)722 static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
723 {
724   Mat_H2OPUS        *a = (Mat_H2OPUS *)A->data;
725   Vec                c;
726   PetscInt           spacedim;
727   const PetscScalar *coords;
728 
729   PetscFunctionBegin;
730   if (a->ptcloud) PetscFunctionReturn(PETSC_SUCCESS);
731   PetscCall(PetscObjectQuery((PetscObject)A, "__math2opus_coords", (PetscObject *)&c));
732   if (!c && a->sampler) {
733     Mat S = a->sampler->GetSamplingMat();
734 
735     PetscCall(PetscObjectQuery((PetscObject)S, "__math2opus_coords", (PetscObject *)&c));
736   }
737   if (!c) {
738     PetscCall(MatH2OpusSetCoords_H2OPUS(A, -1, NULL, PETSC_FALSE, NULL, NULL));
739   } else {
740     PetscCall(VecGetArrayRead(c, &coords));
741     PetscCall(VecGetBlockSize(c, &spacedim));
742     PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, PETSC_FALSE, NULL, NULL));
743     PetscCall(VecRestoreArrayRead(c, &coords));
744   }
745   PetscFunctionReturn(PETSC_SUCCESS);
746 }
747 
MatSetUpMultiply_H2OPUS(Mat A)748 static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
749 {
750   MPI_Comm      comm;
751   PetscMPIInt   size;
752   Mat_H2OPUS   *a = (Mat_H2OPUS *)A->data;
753   PetscInt      n = 0, *idx = NULL;
754   int          *iidx = NULL;
755   PetscCopyMode own;
756   PetscBool     rid;
757 
758   PetscFunctionBegin;
759   if (a->multsetup) PetscFunctionReturn(PETSC_SUCCESS);
760   if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */
761     PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
762   #if defined(PETSC_H2OPUS_USE_GPU)
763     a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
764     a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
765     a->xxs_gpu = 1;
766     a->yys_gpu = 1;
767   #endif
768     a->xx  = new thrust::host_vector<PetscScalar>(n);
769     a->yy  = new thrust::host_vector<PetscScalar>(n);
770     a->xxs = 1;
771     a->yys = 1;
772   } else {
773     IS is;
774     PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
775     PetscCallMPI(MPI_Comm_size(comm, &size));
776     if (!a->h2opus_indexmap) {
777       if (size > 1) {
778         PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
779   #if defined(H2OPUS_USE_MPI)
780         iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
781         n    = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
782   #endif
783       } else {
784         iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
785         n    = a->hmatrix->u_basis_tree.index_map.size();
786       }
787 
788       if (PetscDefined(USE_64BIT_INDICES)) {
789         PetscInt i;
790 
791         own = PETSC_OWN_POINTER;
792         PetscCall(PetscMalloc1(n, &idx));
793         for (i = 0; i < n; i++) idx[i] = iidx[i];
794       } else {
795         own = PETSC_COPY_VALUES;
796         idx = (PetscInt *)iidx;
797       }
798       PetscCall(ISCreateGeneral(comm, n, idx, own, &is));
799       PetscCall(ISSetPermutation(is));
800       PetscCall(ISViewFromOptions(is, (PetscObject)A, "-mat_h2opus_indexmap_view"));
801       a->h2opus_indexmap = is;
802     }
803     PetscCall(ISGetLocalSize(a->h2opus_indexmap, &n));
804     PetscCall(ISGetIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
805     rid = (PetscBool)(n == A->rmap->n);
806     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &rid, 1, MPI_C_BOOL, MPI_LAND, comm));
807     if (rid) PetscCall(ISIdentity(a->h2opus_indexmap, &rid));
808     if (!rid) {
809       if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
810         PetscCall(PetscLayoutCreate(comm, &a->h2opus_rmap));
811         PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap, n));
812         PetscCall(PetscLayoutSetUp(a->h2opus_rmap));
813         PetscCall(PetscLayoutReference(a->h2opus_rmap, &a->h2opus_cmap));
814       }
815       PetscCall(PetscSFCreate(comm, &a->sf));
816       PetscCall(PetscSFSetGraphLayout(a->sf, A->rmap, n, NULL, PETSC_OWN_POINTER, idx));
817       PetscCall(PetscSFSetUp(a->sf));
818       PetscCall(PetscSFViewFromOptions(a->sf, (PetscObject)A, "-mat_h2opus_sf_view"));
819   #if defined(PETSC_H2OPUS_USE_GPU)
820       a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
821       a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
822       a->xxs_gpu = 1;
823       a->yys_gpu = 1;
824   #endif
825       a->xx  = new thrust::host_vector<PetscScalar>(n);
826       a->yy  = new thrust::host_vector<PetscScalar>(n);
827       a->xxs = 1;
828       a->yys = 1;
829     }
830     PetscCall(ISRestoreIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
831   }
832   a->multsetup = PETSC_TRUE;
833   PetscFunctionReturn(PETSC_SUCCESS);
834 }
835 
/*
   Finish assembly of a MATH2OPUS matrix: (re)build the hierarchical structure
   from the attached kernel or from a sampling matrix, set up the mult data,
   and optionally verify the construction against the sampling matrix.
   The assemblytype argument is unused. Any previously built matrices are
   destroyed and rebuilt from scratch (see TODO on reusability below).
*/
static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
{
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
  #if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = a->handle->handle;
  #else
  h2opusHandle_t handle = a->handle;
  #endif
  PetscBool   kernel       = PETSC_FALSE;
  PetscBool   boundtocpu   = PETSC_TRUE;
  PetscBool   samplingdone = PETSC_FALSE;
  MPI_Comm    comm;
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
  PetscCheck(A->rmap->N == A->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");

  /* XXX */
  /* clamp the leaf size so it never exceeds the global matrix size */
  a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));

  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* TODO REUSABILITY of geometric construction */
  delete a->hmatrix;
  delete a->dist_hmatrix;
  #if defined(PETSC_H2OPUS_USE_GPU)
  delete a->hmatrix_gpu;
  delete a->dist_hmatrix_gpu;
  #endif
  a->orthogonal = PETSC_FALSE;

  /* TODO: other? */
  /* admissibility condition for the block cluster tree, controlled by eta */
  H2OpusBoxCenterAdmissibility adm(a->eta);

  PetscCall(PetscLogEventBegin(MAT_H2Opus_Build, A, 0, 0, 0));
  /* allocate the (distributed or sequential) CPU matrix skeleton */
  if (size > 1) {
  #if defined(H2OPUS_USE_MPI)
    a->dist_hmatrix = new DistributedHMatrix(A->rmap->n /* ,A->symmetric */);
  #else
    a->dist_hmatrix = NULL;
  #endif
  } else a->hmatrix = new HMatrix(A->rmap->n, A->symmetric == PETSC_BOOL3_TRUE);
  PetscCall(MatH2OpusInferCoordinates_Private(A));
  PetscCheck(a->ptcloud, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing pointcloud");
  if (a->kernel) {
    /* analytic kernel available: build the H2 matrix entries directly */
    BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
    if (size > 1) {
      PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
  #if defined(H2OPUS_USE_MPI)
      buildDistributedHMatrix(*a->dist_hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord, a->handle);
  #endif
    } else {
      buildHMatrix(*a->hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord);
    }
    kernel = PETSC_TRUE;
  } else {
    /* no kernel: build only the structure, entries come from sampling below */
    PetscCheck(size <= 1, comm, PETSC_ERR_SUP, "Construction from sampling not supported in parallel");
    buildHMatrixStructure(*a->hmatrix, a->ptcloud, a->leafsize, adm);
  }
  PetscCall(MatSetUpMultiply_H2OPUS(A));

  #if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
  /* mirror the freshly built CPU matrix on the GPU when not bound to CPU */
  if (!boundtocpu) {
    if (size > 1) {
      PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
    #if defined(H2OPUS_USE_MPI)
      a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
    #endif
    } else {
      a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
    }
  }
  #endif
  if (size == 1) {
    if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
      /* fill the entries via randomized sampling (hara) of the sampling matrix */
      PetscReal Anorm;
      bool      verbose;

      PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_hara_verbose", &a->hara_verbose, NULL));
      verbose = a->hara_verbose;
      PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(), NORM_2, a->norm_max_samples, &Anorm));
      if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF, "Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n", a->max_rank, a->rtol * Anorm, a->rtol, Anorm, boundtocpu ? "CPU" : "GPU", a->bs));
      if (a->sf && !a->nativemult) a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(), a->hmatrix->u_basis_tree.index_map.data());
      a->sampler->SetStream(handle->getMainStream());
      if (boundtocpu) {
        a->sampler->SetGPUSampling(false);
        hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
  #if defined(PETSC_H2OPUS_USE_GPU)
      } else {
        a->sampler->SetGPUSampling(true);
        hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
  #endif
      }
      samplingdone = PETSC_TRUE;
    }
  }
  #if defined(PETSC_H2OPUS_USE_GPU)
  /* when running on GPU, drop the CPU copies and mark the valid location */
  if (!boundtocpu) {
    delete a->hmatrix;
    delete a->dist_hmatrix;
    a->hmatrix      = NULL;
    a->dist_hmatrix = NULL;
  }
  A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
  #endif
  PetscCall(PetscLogEventEnd(MAT_H2Opus_Build, A, 0, 0, 0));

  if (!a->s) a->s = 1.0;
  A->assembled = PETSC_TRUE;

  if (samplingdone) {
    /* optional verification of the sampled construction (-mat_h2opus_check) */
    PetscBool check  = a->check_construction;
    PetscBool checke = PETSC_FALSE;

    PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check", &check, NULL));
    PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check_explicit", &checke, NULL));
    if (check) {
      Mat               E, Ae;
      PetscReal         n1, ni, n2;
      PetscReal         n1A, niA, n2A;
      PetscErrorCodeFn *normfunc;

      /* E = H2Mat - samplingMat; norms of E measure the construction error */
      Ae = a->sampler->GetSamplingMat();
      PetscCall(MatConvert(A, MATSHELL, MAT_INITIAL_MATRIX, &E));
      PetscCall(MatShellSetOperation(E, MATOP_NORM, (PetscErrorCodeFn *)MatNorm_H2OPUS));
      PetscCall(MatAXPY(E, -1.0, Ae, DIFFERENT_NONZERO_PATTERN));
      PetscCall(MatNorm(E, NORM_1, &n1));
      PetscCall(MatNorm(E, NORM_INFINITY, &ni));
      PetscCall(MatNorm(E, NORM_2, &n2));
      if (checke) {
        /* also dump explicit (AIJ) representations of A, E and the sampling matrix */
        Mat eA, eE, eAe;

        PetscCall(MatComputeOperator(A, MATAIJ, &eA));
        PetscCall(MatComputeOperator(E, MATAIJ, &eE));
        PetscCall(MatComputeOperator(Ae, MATAIJ, &eAe));
        PetscCall(MatFilter(eA, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
        PetscCall(MatFilter(eE, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
        PetscCall(MatFilter(eAe, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
        PetscCall(PetscObjectSetName((PetscObject)eA, "H2Mat"));
        PetscCall(MatView(eA, NULL));
        PetscCall(PetscObjectSetName((PetscObject)eAe, "S"));
        PetscCall(MatView(eAe, NULL));
        PetscCall(PetscObjectSetName((PetscObject)eE, "H2Mat - S"));
        PetscCall(MatView(eE, NULL));
        PetscCall(MatDestroy(&eA));
        PetscCall(MatDestroy(&eE));
        PetscCall(MatDestroy(&eAe));
      }

      /* temporarily use the approximate norm on Ae to compute relative errors */
      PetscCall(MatGetOperation(Ae, MATOP_NORM, &normfunc));
      PetscCall(MatSetOperation(Ae, MATOP_NORM, (PetscErrorCodeFn *)MatNorm_H2OPUS));
      PetscCall(MatNorm(Ae, NORM_1, &n1A));
      PetscCall(MatNorm(Ae, NORM_INFINITY, &niA));
      PetscCall(MatNorm(Ae, NORM_2, &n2A));
      n1A = PetscMax(n1A, PETSC_SMALL);
      n2A = PetscMax(n2A, PETSC_SMALL);
      niA = PetscMax(niA, PETSC_SMALL);
      PetscCall(MatSetOperation(Ae, MATOP_NORM, normfunc));
      PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n", (double)n1, (double)ni, (double)n2, (double)(n1 / n1A), (double)(ni / niA), (double)(n2 / n2A)));
      PetscCall(MatDestroy(&E));
    }
    a->sampler->SetSamplingMat(NULL);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1003 
MatZeroEntries_H2OPUS(Mat A)1004 static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
1005 {
1006   PetscMPIInt size;
1007   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1008 
1009   PetscFunctionBegin;
1010   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1011   PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet supported");
1012   a->hmatrix->clearData();
1013   #if defined(PETSC_H2OPUS_USE_GPU)
1014   if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1015   #endif
1016   PetscFunctionReturn(PETSC_SUCCESS);
1017 }
1018 
/*
   Duplicate a MATH2OPUS matrix. Scalar parameters are always copied; the
   hierarchical matrices are deep-copied (their data is cleared when
   op == MAT_DO_NOT_COPY_VALUES), while the SF and index map are shared by
   reference. NOTE(review): the sampler is not duplicated — presumably
   duplicates are only meaningful after construction; confirm with callers.
*/
static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
{
  Mat         A;
  Mat_H2OPUS *a, *b = (Mat_H2OPUS *)B->data;
  PetscBool   iscpu = PetscDefined(H2OPUS_USE_GPU) ? PETSC_FALSE : PETSC_TRUE;
  MPI_Comm    comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
  PetscCall(MatCreate(comm, &A));
  PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
  PetscCall(MatSetType(A, MATH2OPUS));
  PetscCall(MatPropagateSymmetryOptions(B, A));
  a = (Mat_H2OPUS *)A->data;

  /* copy construction/compression parameters */
  a->eta              = b->eta;
  a->leafsize         = b->leafsize;
  a->basisord         = b->basisord;
  a->max_rank         = b->max_rank;
  a->bs               = b->bs;
  a->rtol             = b->rtol;
  a->norm_max_samples = b->norm_max_samples;
  if (op == MAT_COPY_VALUES) a->s = b->s;

  /* deep copies of the point cloud and (optionally) the kernel */
  a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
  if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);

  /* deep copies of the hierarchical matrices present on B */
  #if defined(H2OPUS_USE_MPI)
  if (b->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix);
    #if defined(PETSC_H2OPUS_USE_GPU)
  if (b->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu);
    #endif
  #endif
  if (b->hmatrix) {
    a->hmatrix = new HMatrix(*b->hmatrix);
    if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
  }
  #if defined(PETSC_H2OPUS_USE_GPU)
  if (b->hmatrix_gpu) {
    a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
    if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
  }
  #endif
  /* SF and index map are shared, not copied (see MatSetUpMultiply_H2OPUS) */
  if (b->sf) {
    PetscCall(PetscObjectReference((PetscObject)b->sf));
    a->sf = b->sf;
  }
  if (b->h2opus_indexmap) {
    PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap));
    a->h2opus_indexmap = b->h2opus_indexmap;
  }

  PetscCall(MatSetUp(A));
  PetscCall(MatSetUpMultiply_H2OPUS(A));
  if (op == MAT_COPY_VALUES) {
    A->assembled  = PETSC_TRUE;
    a->orthogonal = b->orthogonal;
  #if defined(PETSC_H2OPUS_USE_GPU)
    A->offloadmask = B->offloadmask;
  #endif
  }
  #if defined(PETSC_H2OPUS_USE_GPU)
  iscpu = B->boundtocpu;
  #endif
  PetscCall(MatBindToCPU(A, iscpu));

  *nA = A;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1088 
MatView_H2OPUS(Mat A,PetscViewer view)1089 static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
1090 {
1091   Mat_H2OPUS       *h2opus = (Mat_H2OPUS *)A->data;
1092   PetscBool         isascii, vieweps;
1093   PetscMPIInt       size;
1094   PetscViewerFormat format;
1095 
1096   PetscFunctionBegin;
1097   PetscCall(PetscObjectTypeCompare((PetscObject)view, PETSCVIEWERASCII, &isascii));
1098   PetscCall(PetscViewerGetFormat(view, &format));
1099   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1100   if (isascii) {
1101     if (format == PETSC_VIEWER_ASCII_MATLAB) {
1102       if (size == 1) {
1103         FILE *fp;
1104         PetscCall(PetscViewerASCIIGetPointer(view, &fp));
1105         dumpHMatrix(*h2opus->hmatrix, 6, fp);
1106       }
1107     } else {
1108       PetscCall(PetscViewerASCIIPrintf(view, "  H-Matrix constructed from %s\n", h2opus->kernel ? "Kernel" : "Mat"));
1109       PetscCall(PetscViewerASCIIPrintf(view, "  PointCloud dim %" PetscInt_FMT "\n", h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0));
1110       PetscCall(PetscViewerASCIIPrintf(view, "  Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n", h2opus->leafsize, (double)h2opus->eta));
1111       if (!h2opus->kernel) {
1112         PetscCall(PetscViewerASCIIPrintf(view, "  Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n", h2opus->max_rank, h2opus->bs, (double)h2opus->rtol));
1113       } else {
1114         PetscCall(PetscViewerASCIIPrintf(view, "  Off-diagonal blocks approximation order %" PetscInt_FMT "\n", h2opus->basisord));
1115       }
1116       PetscCall(PetscViewerASCIIPrintf(view, "  Number of samples for norms %" PetscInt_FMT "\n", h2opus->norm_max_samples));
1117       if (size == 1) {
1118         double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
1119         double low_rank_cpu  = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
1120   #if defined(PETSC_HAVE_CUDA)
1121         double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
1122         double low_rank_gpu  = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1123   #endif
1124         PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu));
1125   #if defined(PETSC_HAVE_CUDA)
1126         PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu));
1127   #endif
1128       } else {
1129   #if defined(PETSC_HAVE_CUDA)
1130         double      matrix_mem[4] = {0., 0., 0., 0.};
1131         PetscMPIInt rsize         = 4;
1132   #else
1133         double      matrix_mem[2] = {0., 0.};
1134         PetscMPIInt rsize         = 2;
1135   #endif
1136   #if defined(H2OPUS_USE_MPI)
1137         matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
1138         matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
1139     #if defined(PETSC_HAVE_CUDA)
1140         matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
1141         matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
1142     #endif
1143   #endif
1144         PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, matrix_mem, rsize, MPI_DOUBLE_PRECISION, MPI_SUM, PetscObjectComm((PetscObject)A)));
1145         PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]));
1146   #if defined(PETSC_HAVE_CUDA)
1147         PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]));
1148   #endif
1149       }
1150     }
1151   }
1152   vieweps = PETSC_FALSE;
1153   PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps", &vieweps, NULL));
1154   if (vieweps) {
1155     char        filename[256];
1156     const char *name;
1157 
1158     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1159     PetscCall(PetscSNPrintf(filename, sizeof(filename), "%s_structure.eps", name));
1160     PetscCall(PetscOptionsGetString(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps_filename", filename, sizeof(filename), NULL));
1161     outputEps(*h2opus->hmatrix, filename);
1162   }
1163   PetscFunctionReturn(PETSC_SUCCESS);
1164 }
1165 
MatH2OpusSetCoords_H2OPUS(Mat A,PetscInt spacedim,const PetscReal coords[],PetscBool cdist,MatH2OpusKernelFn * kernel,void * kernelctx)1166 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernelFn *kernel, void *kernelctx)
1167 {
1168   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
1169   PetscReal  *gcoords;
1170   PetscInt    N;
1171   MPI_Comm    comm;
1172   PetscMPIInt size;
1173   PetscBool   cong;
1174 
1175   PetscFunctionBegin;
1176   PetscCall(PetscLayoutSetUp(A->rmap));
1177   PetscCall(PetscLayoutSetUp(A->cmap));
1178   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1179   PetscCall(MatHasCongruentLayouts(A, &cong));
1180   PetscCheck(cong, comm, PETSC_ERR_SUP, "Only for square matrices with congruent layouts");
1181   N = A->rmap->N;
1182   PetscCallMPI(MPI_Comm_size(comm, &size));
1183   if (spacedim > 0 && size > 1 && cdist) {
1184     PetscSF      sf;
1185     MPI_Datatype dtype;
1186 
1187     PetscCallMPI(MPI_Type_contiguous(spacedim, MPIU_REAL, &dtype));
1188     PetscCallMPI(MPI_Type_commit(&dtype));
1189 
1190     PetscCall(PetscSFCreate(comm, &sf));
1191     PetscCall(PetscSFSetGraphWithPattern(sf, A->rmap, PETSCSF_PATTERN_ALLGATHER));
1192     PetscCall(PetscMalloc1(spacedim * N, &gcoords));
1193     PetscCall(PetscSFBcastBegin(sf, dtype, coords, gcoords, MPI_REPLACE));
1194     PetscCall(PetscSFBcastEnd(sf, dtype, coords, gcoords, MPI_REPLACE));
1195     PetscCall(PetscSFDestroy(&sf));
1196     PetscCallMPI(MPI_Type_free(&dtype));
1197   } else gcoords = (PetscReal *)coords;
1198 
1199   delete h2opus->ptcloud;
1200   delete h2opus->kernel;
1201   h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim, N, gcoords);
1202   if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel, spacedim, kernelctx);
1203   if (gcoords != coords) PetscCall(PetscFree(gcoords));
1204   A->preallocated = PETSC_TRUE;
1205   PetscFunctionReturn(PETSC_SUCCESS);
1206 }
1207 
1208   #if defined(PETSC_H2OPUS_USE_GPU)
/*
   Bind/unbind a MATH2OPUS matrix to the CPU. When binding to CPU with valid
   GPU data, the GPU matrices are migrated to (or assigned onto existing) CPU
   mirrors and then freed; the reverse happens when unbinding with valid CPU
   data. The default vector type is switched accordingly.
*/
static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
{
  PetscMPIInt size;
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
    /* GPU -> CPU migration */
    if (size > 1) {
      PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
    #if defined(H2OPUS_USE_MPI)
      if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
      else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
    #endif
    } else {
      PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
      else *a->hmatrix = *a->hmatrix_gpu;
    }
    /* only one copy is kept alive: free the GPU side */
    delete a->hmatrix_gpu;
    delete a->dist_hmatrix_gpu;
    a->hmatrix_gpu      = NULL;
    a->dist_hmatrix_gpu = NULL;
    A->offloadmask      = PETSC_OFFLOAD_CPU;
  } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
    /* CPU -> GPU migration */
    if (size > 1) {
      PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
    #if defined(H2OPUS_USE_MPI)
      if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
      else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
    #endif
    } else {
      PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
      else *a->hmatrix_gpu = *a->hmatrix;
    }
    delete a->hmatrix;
    delete a->dist_hmatrix;
    a->hmatrix      = NULL;
    a->dist_hmatrix = NULL;
    A->offloadmask  = PETSC_OFFLOAD_GPU;
  }
  /* default vector type follows the binding */
  PetscCall(PetscFree(A->defaultvectype));
  if (!flg) {
    PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
  } else {
    PetscCall(PetscStrallocpy(VECSTANDARD, &A->defaultvectype));
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1260   #endif
1261 
1262 /*MC
1263    MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package {cite}`zampinibouakaramturkiyyahkniokeyes2022`.
1264 
1265    Options Database Key:
1266 .  -mat_type h2opus - matrix type to "h2opus"
1267 
1268    Level: beginner
1269 
1270    Notes:
1271    H2Opus implements hierarchical matrices in the $H^2$ flavour. It supports CPU or NVIDIA GPUs.
1272 
1273    For CPU only builds, use `./configure --download-h2opus --download-thrust` to install PETSc to use H2Opus.
1274    In order to run on NVIDIA GPUs, use `./configure --download-h2opus --download-magma --download-kblas`.
1275 
1276 .seealso: [](ch_matrices), `Mat`, `MATH2OPUS`, `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()`
1277 M*/
/*
   Constructor for MATH2OPUS: allocates the per-matrix context, sets default
   construction/compression parameters, creates the H2Opus handle, and wires
   the MatOps table and composed product functions.
*/
PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
{
  Mat_H2OPUS *a;
  PetscMPIInt size;

  PetscFunctionBegin;
  #if defined(PETSC_H2OPUS_USE_GPU)
  /* lazy device initialization must happen before any H2Opus GPU usage */
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  #endif
  PetscCall(PetscNew(&a));
  A->data = (void *)a;

  /* default parameters (overridable via MatSetFromOptions) */
  a->eta              = 0.9;
  a->leafsize         = 32;
  a->basisord         = 4;
  a->max_rank         = 64;
  a->bs               = 32;
  a->rtol             = 1.e-4;
  a->s                = 1.0;
  a->norm_max_samples = 10;
  a->resize           = PETSC_TRUE; /* reallocate after compression */
  #if defined(H2OPUS_USE_MPI)
  h2opusCreateDistributedHandleComm(&a->handle, PetscObjectComm((PetscObject)A));
  #else
  h2opusCreateHandle(&a->handle);
  #endif
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATH2OPUS));
  PetscCall(PetscMemzero(A->ops, sizeof(struct _MatOps)));

  A->ops->destroy          = MatDestroy_H2OPUS;
  A->ops->view             = MatView_H2OPUS;
  A->ops->assemblyend      = MatAssemblyEnd_H2OPUS;
  A->ops->mult             = MatMult_H2OPUS;
  A->ops->multtranspose    = MatMultTranspose_H2OPUS;
  A->ops->multadd          = MatMultAdd_H2OPUS;
  A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
  A->ops->scale            = MatScale_H2OPUS;
  A->ops->duplicate        = MatDuplicate_H2OPUS;
  A->ops->setfromoptions   = MatSetFromOptions_H2OPUS;
  A->ops->norm             = MatNorm_H2OPUS;
  A->ops->zeroentries      = MatZeroEntries_H2OPUS;
  #if defined(PETSC_H2OPUS_USE_GPU)
  A->ops->bindtocpu = MatBindToCPU_H2OPUS;
  #endif

  /* products with dense matrices (CPU and CUDA, sequential and parallel) */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", MatProductSetFromOptions_H2OPUS));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", MatProductSetFromOptions_H2OPUS));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", MatProductSetFromOptions_H2OPUS));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", MatProductSetFromOptions_H2OPUS));
  #if defined(PETSC_H2OPUS_USE_GPU)
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
  #endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
1334 
1335 /*@
1336   MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.
1337 
1338   Input Parameter:
1339 . A - the matrix
1340 
1341   Level: intermediate
1342 
1343 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`
1344 @*/
/*
   Body of MatH2OpusOrthogonalize (public doc in the manual page above).
   Dispatches to the sequential/distributed, CPU/GPU variant of horthog,
   records flops from the HLibProfile counters, and caches the result via
   a->orthogonal so repeated calls are no-ops until the matrix changes.
   Note: the `} else {` branches below live *inside* #if regions, so the
   compiled control flow depends on the build configuration.
*/
PetscErrorCode MatH2OpusOrthogonalize(Mat A)
{
  PetscBool   ish2opus;
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
  PetscMPIInt size;
  PetscBool   boundtocpu = PETSC_TRUE;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidType(A, 1);
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
  if (!ish2opus) PetscFunctionReturn(PETSC_SUCCESS);
  if (a->orthogonal) PetscFunctionReturn(PETSC_SUCCESS);
  /* reset profiling counters so the flop log below covers only this call */
  HLibProfile::clear();
  PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog, A, 0, 0, 0));
  #if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
  #endif
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (size > 1) {
    if (boundtocpu) {
      PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
  #if defined(H2OPUS_USE_MPI)
      distributed_horthog(*a->dist_hmatrix, a->handle);
  #endif
  #if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
    #if defined(H2OPUS_USE_MPI)
      distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
    #endif
      PetscCall(PetscLogGpuTimeEnd());
  #endif
    }
  } else {
  #if defined(H2OPUS_USE_MPI)
    h2opusHandle_t handle = a->handle->handle;
  #else
    h2opusHandle_t handle = a->handle;
  #endif
    if (boundtocpu) {
      PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      horthog(*a->hmatrix, handle);
  #if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
      horthog(*a->hmatrix_gpu, handle);
      PetscCall(PetscLogGpuTimeEnd());
  #endif
    }
  }
  a->orthogonal = PETSC_TRUE;
  { /* log flops */
    double gops, time, perf, dev;
    /* HLibProfile reports gigaops; PETSc logging wants raw flops */
    HLibProfile::getHorthogPerf(gops, time, perf, dev);
  #if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9 * gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9 * gops));
    }
  #else
    PetscCall(PetscLogFlops(1e9 * gops));
  #endif
  }
  PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog, A, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1417 
1418 /*@
1419   MatH2OpusCompress - Compress a hierarchical matrix.
1420 
1421   Input Parameters:
1422 + A   - the matrix
1423 - tol - the absolute truncation threshold
1424 
1425   Level: intermediate
1426 
1427 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()`
1428 @*/
PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
{
  PetscBool   ish2opus;
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
  PetscMPIInt size;
  PetscBool   boundtocpu = PETSC_TRUE; /* without GPU support everything runs on the CPU */

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidType(A, 1);
  PetscValidLogicalCollectiveReal(A, tol, 2);
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
  /* silently no-op for non-H2OPUS matrices and for non-positive thresholds */
  if (!ish2opus || tol <= 0.0) PetscFunctionReturn(PETSC_SUCCESS);
  /* compression operates on an orthogonalized basis; make sure we have one */
  PetscCall(MatH2OpusOrthogonalize(A));
  /* reset H2OPUS profiling so getHcompressPerf() below reports only this compression */
  HLibProfile::clear();
  PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress, A, 0, 0, 0));
  #if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
  #endif
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (size > 1) { /* distributed matrix: needs the H2OPUS MPI backend */
    if (boundtocpu) {
      PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
  #if defined(H2OPUS_USE_MPI)
      distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
      /* optionally copy-construct a fresh matrix and drop the old one,
         presumably to shrink buffers grown during compression -- TODO confirm */
      if (a->resize) {
        DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix);
        delete a->dist_hmatrix;
        a->dist_hmatrix = dist_hmatrix;
      }
  #endif
  #if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU; /* CPU copy is now the valid one */
    } else {
      PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
    #if defined(H2OPUS_USE_MPI)
      distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);

      if (a->resize) {
        DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu);
        delete a->dist_hmatrix_gpu;
        a->dist_hmatrix_gpu = dist_hmatrix_gpu;
      }
    #endif
      PetscCall(PetscLogGpuTimeEnd());
  #endif
    }
  } else { /* sequential matrix */
  #if defined(H2OPUS_USE_MPI)
    h2opusHandle_t handle = a->handle->handle; /* unwrap the distributed handle */
  #else
    h2opusHandle_t handle = a->handle;
  #endif
    if (boundtocpu) {
      PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      hcompress(*a->hmatrix, tol, handle);

      if (a->resize) {
        HMatrix *hmatrix = new HMatrix(*a->hmatrix);
        delete a->hmatrix;
        a->hmatrix = hmatrix;
      }
  #if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
      hcompress(*a->hmatrix_gpu, tol, handle);
      PetscCall(PetscLogGpuTimeEnd());

      if (a->resize) {
        HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu);
        delete a->hmatrix_gpu;
        a->hmatrix_gpu = hmatrix_gpu;
      }
  #endif
    }
  }
  { /* log flops: H2OPUS reports GOps via its profiler, attribute them to CPU or GPU */
    double gops, time, perf, dev;
    HLibProfile::getHcompressPerf(gops, time, perf, dev);
  #if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9 * gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9 * gops));
    }
  #else
    PetscCall(PetscLogFlops(1e9 * gops));
  #endif
  }
  PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress, A, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1524 
1525 /*@
1526   MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix-vector products on another matrix to construct a hierarchical matrix.
1527 
1528   Input Parameters:
1529 + A   - the hierarchical matrix
1530 . B   - the matrix to be sampled
1531 . bs  - maximum number of samples to be taken concurrently
1532 - tol - relative tolerance for construction
1533 
1534   Level: intermediate
1535 
1536   Notes:
1537   You need to call `MatAssemblyBegin()` and `MatAssemblyEnd()` to update the hierarchical matrix.
1538 
1539 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`
1540 @*/
MatH2OpusSetSamplingMat(Mat A,Mat B,PetscInt bs,PetscReal tol)1541 PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1542 {
1543   PetscBool ish2opus;
1544 
1545   PetscFunctionBegin;
1546   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1547   PetscValidType(A, 1);
1548   if (B) PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
1549   PetscValidLogicalCollectiveInt(A, bs, 3);
1550   PetscValidLogicalCollectiveReal(A, tol, 4);
1551   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1552   if (ish2opus) {
1553     Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1554 
1555     if (!a->sampler) a->sampler = new PetscMatrixSampler();
1556     a->sampler->SetSamplingMat(B);
1557     if (bs > 0) a->bs = bs;
1558     if (tol > 0.) a->rtol = tol;
1559     delete a->kernel;
1560   }
1561   PetscFunctionReturn(PETSC_SUCCESS);
1562 }
1563 
1564 /*@C
1565   MatCreateH2OpusFromKernel - Creates a `MATH2OPUS` from a user-supplied kernel.
1566 
1567   Input Parameters:
1568 + comm      - MPI communicator
1569 . m         - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
1570 . n         - number of local columns (or `PETSC_DECIDE` to have calculated if `N` is given)
1571 . M         - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
1572 . N         - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
1573 . spacedim  - dimension of the space coordinates
1574 . coords    - coordinates of the points
1575 . cdist     - whether or not coordinates are distributed
1576 . kernel    - computational kernel (or `NULL`)
1577 . kernelctx - kernel context
1578 . eta       - admissibility condition tolerance
1579 . leafsize  - leaf size in cluster tree
1580 - basisord  - approximation order for Chebychev interpolation of low-rank blocks
1581 
1582   Output Parameter:
1583 . nA - matrix
1584 
1585   Options Database Keys:
1586 + -mat_h2opus_leafsize <`PetscInt`>    - Leaf size of cluster tree
1587 . -mat_h2opus_eta <`PetscReal`>        - Admissibility condition tolerance
1588 . -mat_h2opus_order <`PetscInt`>       - Chebychev approximation order
1589 - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms
1590 
1591   Level: intermediate
1592 
1593 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`
1594 @*/
MatCreateH2OpusFromKernel(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt spacedim,const PetscReal coords[],PetscBool cdist,MatH2OpusKernelFn * kernel,void * kernelctx,PetscReal eta,PetscInt leafsize,PetscInt basisord,Mat * nA)1595 PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernelFn *kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat *nA)
1596 {
1597   Mat         A;
1598   Mat_H2OPUS *h2opus;
1599   PetscBool   iscpu = PetscDefined(H2OPUS_USE_GPU) ? PETSC_FALSE : PETSC_TRUE;
1600 
1601   PetscFunctionBegin;
1602   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1603   PetscCall(MatCreate(comm, &A));
1604   PetscCall(MatSetSizes(A, m, n, M, N));
1605   PetscCheck(M == N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1606   PetscCall(MatSetType(A, MATH2OPUS));
1607   PetscCall(MatBindToCPU(A, iscpu));
1608   PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, kernel, kernelctx));
1609 
1610   h2opus = (Mat_H2OPUS *)A->data;
1611   if (eta > 0.) h2opus->eta = eta;
1612   if (leafsize > 0) h2opus->leafsize = leafsize;
1613   if (basisord > 0) h2opus->basisord = basisord;
1614 
1615   *nA = A;
1616   PetscFunctionReturn(PETSC_SUCCESS);
1617 }
1618 
1619 /*@
1620   MatCreateH2OpusFromMat - Creates a `MATH2OPUS` sampling from a user-supplied operator.
1621 
1622   Input Parameters:
1623 + B        - the matrix to be sampled
1624 . spacedim - dimension of the space coordinates
1625 . coords   - coordinates of the points
1626 . cdist    - whether or not coordinates are distributed
1627 . eta      - admissibility condition tolerance
1628 . leafsize - leaf size in cluster tree
1629 . maxrank  - maximum rank allowed
1630 . bs       - maximum number of samples to be taken concurrently
1631 - rtol     - relative tolerance for construction
1632 
1633   Output Parameter:
1634 . nA - matrix
1635 
1636   Options Database Keys:
1637 + -mat_h2opus_leafsize <`PetscInt`>      - Leaf size of cluster tree
1638 . -mat_h2opus_eta <`PetscReal`>          - Admissibility condition tolerance
1639 . -mat_h2opus_maxrank <`PetscInt`>       - Maximum rank when constructed from matvecs
1640 . -mat_h2opus_samples <`PetscInt`>       - Maximum number of samples to be taken concurrently when constructing from matvecs
1641 . -mat_h2opus_rtol <`PetscReal`>         - Relative tolerance for construction from sampling
1642 . -mat_h2opus_check <`PetscBool`>        - Check error when constructing from sampling during MatAssemblyEnd()
1643 . -mat_h2opus_hara_verbose <`PetscBool`> - Verbose output from hara construction
1644 - -mat_h2opus_normsamples <`PetscInt`>   - Maximum number of samples to be used when estimating norms
1645 
1646   Level: intermediate
1647 
1648   Note:
1649   Not available in parallel
1650 
1651 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()`
1652 @*/
PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
{
  Mat         A;
  Mat_H2OPUS *h2opus;
  MPI_Comm    comm;
  PetscBool   boundtocpu = PETSC_TRUE; /* stay on the CPU unless B's vector type is GPU-capable */

  PetscFunctionBegin;
  PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
  PetscValidLogicalCollectiveInt(B, spacedim, 2);
  PetscValidLogicalCollectiveBool(B, cdist, 4);
  PetscValidLogicalCollectiveReal(B, eta, 5);
  PetscValidLogicalCollectiveInt(B, leafsize, 6);
  PetscValidLogicalCollectiveInt(B, maxrank, 7);
  PetscValidLogicalCollectiveInt(B, bs, 8);
  PetscValidLogicalCollectiveReal(B, rtol, 9);
  PetscAssertPointer(nA, 10);
  PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
  PetscCheck(B->rmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
  PetscCheck(B->rmap->N == B->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
  PetscCall(MatCreate(comm, &A));
  PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
  #if defined(PETSC_H2OPUS_USE_GPU)
  {
    /* probe B's vector type: CUDA (and CUDA-backed Kokkos) vectors allow GPU sampling */
    VecType   vtype;
    PetscBool isstd, iscuda, iskok;

    PetscCall(MatGetVecType(B, &vtype));
    PetscCall(PetscStrcmpAny(vtype, &isstd, VECSTANDARD, VECSEQ, VECMPI, ""));
    PetscCall(PetscStrcmpAny(vtype, &iscuda, VECCUDA, VECSEQCUDA, VECMPICUDA, ""));
    PetscCall(PetscStrcmpAny(vtype, &iskok, VECKOKKOS, VECSEQKOKKOS, VECMPIKOKKOS, ""))<;
    PetscCheck(isstd || iscuda || iskok, comm, PETSC_ERR_SUP, "Not for type %s", vtype);
    if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
    if (iskok && PetscDefined(HAVE_MACRO_KOKKOS_ENABLE_CUDA)) boundtocpu = PETSC_FALSE;
  }
  #endif
  PetscCall(MatSetType(A, MATH2OPUS));
  PetscCall(MatBindToCPU(A, boundtocpu));
  /* spacedim == 0 means no coordinates: admissibility must come from elsewhere */
  if (spacedim) PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, NULL, NULL));
  PetscCall(MatPropagateSymmetryOptions(B, A));
  /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */

  /* the new matrix samples B through a PetscMatrixSampler; non-positive
     parameters keep the type's defaults */
  h2opus          = (Mat_H2OPUS *)A->data;
  h2opus->sampler = new PetscMatrixSampler(B);
  if (eta > 0.) h2opus->eta = eta;
  if (leafsize > 0) h2opus->leafsize = leafsize;
  if (maxrank > 0) h2opus->max_rank = maxrank;
  if (bs > 0) h2opus->bs = bs;
  if (rtol > 0.) h2opus->rtol = rtol;
  *nA             = A;
  A->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1706 
1707 /*@
1708   MatH2OpusGetIndexMap - Access reordering index set.
1709 
1710   Input Parameter:
1711 . A - the matrix
1712 
1713   Output Parameter:
1714 . indexmap - the index set for the reordering
1715 
1716   Level: intermediate
1717 
1718 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1719 @*/
MatH2OpusGetIndexMap(Mat A,IS * indexmap)1720 PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1721 {
1722   PetscBool   ish2opus;
1723   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1724 
1725   PetscFunctionBegin;
1726   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1727   PetscValidType(A, 1);
1728   PetscAssertPointer(indexmap, 2);
1729   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1730   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1731   PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1732   *indexmap = a->h2opus_indexmap;
1733   PetscFunctionReturn(PETSC_SUCCESS);
1734 }
1735 
1736 /*@
1737   MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering
1738 
1739   Input Parameters:
1740 + A             - the matrix
1741 . nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1742 - in            - the vector to be mapped
1743 
1744   Output Parameter:
1745 . out - the newly created mapped vector
1746 
1747   Level: intermediate
1748 
1749 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1750 @*/
MatH2OpusMapVec(Mat A,PetscBool nativetopetsc,Vec in,Vec * out)1751 PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec *out)
1752 {
1753   PetscBool    ish2opus;
1754   Mat_H2OPUS  *a = (Mat_H2OPUS *)A->data;
1755   PetscScalar *xin, *xout;
1756   PetscBool    nm;
1757 
1758   PetscFunctionBegin;
1759   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1760   PetscValidType(A, 1);
1761   PetscValidLogicalCollectiveBool(A, nativetopetsc, 2);
1762   PetscValidHeaderSpecific(in, VEC_CLASSID, 3);
1763   PetscAssertPointer(out, 4);
1764   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1765   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1766   PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1767   nm = a->nativemult;
1768   PetscCall(MatH2OpusSetNativeMult(A, (PetscBool)!nativetopetsc));
1769   PetscCall(MatCreateVecs(A, out, NULL));
1770   PetscCall(MatH2OpusSetNativeMult(A, nm));
1771   if (!a->sf) { /* same ordering */
1772     PetscCall(VecCopy(in, *out));
1773     PetscFunctionReturn(PETSC_SUCCESS);
1774   }
1775   PetscCall(VecGetArrayRead(in, (const PetscScalar **)&xin));
1776   PetscCall(VecGetArrayWrite(*out, &xout));
1777   if (nativetopetsc) {
1778     PetscCall(PetscSFReduceBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1779     PetscCall(PetscSFReduceEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1780   } else {
1781     PetscCall(PetscSFBcastBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1782     PetscCall(PetscSFBcastEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1783   }
1784   PetscCall(VecRestoreArrayRead(in, (const PetscScalar **)&xin));
1785   PetscCall(VecRestoreArrayWrite(*out, &xout));
1786   PetscFunctionReturn(PETSC_SUCCESS);
1787 }
1788 
1789 /*@
1790   MatH2OpusLowRankUpdate - Perform a low-rank update of the form $ A = A + s * U * V^T $
1791 
1792   Input Parameters:
1793 + A - the hierarchical `MATH2OPUS` matrix
1794 . s - the scaling factor
1795 . U - the dense low-rank update matrix
1796 - V - (optional) the dense low-rank update matrix (if `NULL`, then `V` = `U` is assumed)
1797 
1798   Note:
1799   The `U` and `V` matrices must be in `MATDENSE` dense format
1800 
1801   Level: intermediate
1802 
1803 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE`
1804 @*/
PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s)
{
  PetscBool flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidType(A, 1);
  PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
  PetscValidHeaderSpecific(U, MAT_CLASSID, 2);
  PetscCheckSameComm(A, 1, U, 2);
  if (V) {
    PetscValidHeaderSpecific(V, MAT_CLASSID, 3);
    PetscCheckSameComm(A, 1, V, 3);
  }
  PetscValidLogicalCollectiveScalar(A, s, 4);

  if (!V) V = U; /* symmetric update A += s*U*U^T when V is not given */
  PetscCheck(U->cmap->N == V->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT, U->cmap->N, V->cmap->N);
  if (!U->cmap->N) PetscFunctionReturn(PETSC_SUCCESS); /* rank-0 update is a no-op */
  PetscCall(PetscLayoutCompare(U->rmap, A->rmap, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A and U must have the same row layout");
  PetscCall(PetscLayoutCompare(V->rmap, A->cmap, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A column layout must match V row column layout");
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &flg));
  if (flg) {
    Mat_H2OPUS        *a = (Mat_H2OPUS *)A->data;
    const PetscScalar *u, *v, *uu, *vv;
    PetscInt           ldu, ldv;
    PetscMPIInt        size;
  #if defined(H2OPUS_USE_MPI)
    h2opusHandle_t handle = a->handle->handle; /* unwrap the distributed handle */
  #else
    h2opusHandle_t handle = a->handle;
  #endif
    /* use the SF permutation only when A multiplies in PETSc ordering */
    PetscBool usesf = (PetscBool)(a->sf && !a->nativemult);
    PetscSF   usf, vsf;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
    PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet implemented in parallel");
    PetscCall(PetscLogEventBegin(MAT_H2Opus_LR, A, 0, 0, 0));
    PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U, &flg, MATSEQDENSE, MATMPIDENSE, ""));
    PetscCheck(flg, PetscObjectComm((PetscObject)U), PETSC_ERR_SUP, "Not for U of type %s", ((PetscObject)U)->type_name);
    PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V, &flg, MATSEQDENSE, MATMPIDENSE, ""));
    PetscCheck(flg, PetscObjectComm((PetscObject)V), PETSC_ERR_SUP, "Not for V of type %s", ((PetscObject)V)->type_name);
    PetscCall(MatDenseGetLDA(U, &ldu));
    PetscCall(MatDenseGetLDA(V, &ldv));
    PetscCall(MatBoundToCPU(A, &flg)); /* flg now selects the CPU vs GPU path below */
    if (usesf) {
      PetscInt n;

      /* strided SFs permute the dense columns of U and V into H2Opus ordering */
      PetscCall(MatDenseGetH2OpusStridedSF(U, a->sf, &usf));
      PetscCall(MatDenseGetH2OpusStridedSF(V, a->sf, &vsf));
      PetscCall(MatH2OpusResizeBuffers_Private(A, U->cmap->N, V->cmap->N));
      PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
      /* permuted copies are stored contiguously, so the leading dimension is n */
      ldu = n;
      ldv = n;
    }
    if (flg) { /* CPU path */
      PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      PetscCall(MatDenseGetArrayRead(U, &u));
      PetscCall(MatDenseGetArrayRead(V, &v));
      if (usesf) {
        /* scatter V (and U, unless they alias) into the preallocated work buffers */
        vv = MatH2OpusGetThrustPointer(*a->yy);
        PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
        if (U != V) {
          uu = MatH2OpusGetThrustPointer(*a->xx);
          PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
          PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
        } else uu = vv;
      } else {
        uu = u;
        vv = v;
      }
      /* A += s * uu * vv^T performed by H2OPUS */
      hlru_global(*a->hmatrix, uu, ldu, vv, ldv, U->cmap->N, s, handle);
      PetscCall(MatDenseRestoreArrayRead(U, &u));
      PetscCall(MatDenseRestoreArrayRead(V, &v));
    } else { /* GPU path */
  #if defined(PETSC_H2OPUS_USE_GPU)
      PetscBool flgU, flgV;

      PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      /* temporarily convert host dense matrices to CUDA in place; undone below */
      PetscCall(PetscObjectTypeCompareAny((PetscObject)U, &flgU, MATSEQDENSE, MATMPIDENSE, ""));
      if (flgU) PetscCall(MatConvert(U, MATDENSECUDA, MAT_INPLACE_MATRIX, &U));
      PetscCall(PetscObjectTypeCompareAny((PetscObject)V, &flgV, MATSEQDENSE, MATMPIDENSE, ""));
      if (flgV) PetscCall(MatConvert(V, MATDENSECUDA, MAT_INPLACE_MATRIX, &V));
      PetscCall(MatDenseCUDAGetArrayRead(U, &u));
      PetscCall(MatDenseCUDAGetArrayRead(V, &v));
      if (usesf) {
        vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
        PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
        if (U != V) {
          uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
          PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
          PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
        } else uu = vv;
      } else {
        uu = u;
        vv = v;
      }
  #else
      SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "This should not happen");
  #endif
      hlru_global(*a->hmatrix_gpu, uu, ldu, vv, ldv, U->cmap->N, s, handle);
  #if defined(PETSC_H2OPUS_USE_GPU)
      PetscCall(MatDenseCUDARestoreArrayRead(U, &u));
      PetscCall(MatDenseCUDARestoreArrayRead(V, &v));
      if (flgU) PetscCall(MatConvert(U, MATDENSE, MAT_INPLACE_MATRIX, &U));
      if (flgV) PetscCall(MatConvert(V, MATDENSE, MAT_INPLACE_MATRIX, &V));
  #endif
    }
    PetscCall(PetscLogEventEnd(MAT_H2Opus_LR, A, 0, 0, 0));
    a->orthogonal = PETSC_FALSE; /* the update invalidates the orthogonal basis */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1922 #endif
1923