1 #include <h2opusconf.h>
2 /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
3 #if !defined(__CUDACC__) || defined(H2OPUS_USE_GPU)
4 #include <h2opus.h>
5 #if defined(H2OPUS_USE_MPI)
6 #include <h2opus/distributed/distributed_h2opus_handle.h>
7 #include <h2opus/distributed/distributed_geometric_construction.h>
8 #include <h2opus/distributed/distributed_hgemv.h>
9 #include <h2opus/distributed/distributed_horthog.h>
10 #include <h2opus/distributed/distributed_hcompress.h>
11 #endif
12 #include <h2opus/util/boxentrygen.h>
13 #include <petsc/private/matimpl.h>
14 #include <petsc/private/vecimpl.h>
15 #include <petsc/private/deviceimpl.h> /*I "petscmat.h" I*/
16 #include <petscsf.h>
17
18 /* math2opusutils */
19 PETSC_INTERN PetscErrorCode MatDenseGetH2OpusStridedSF(Mat, PetscSF, PetscSF *);
20 PETSC_INTERN PetscErrorCode VecSign(Vec, Vec);
21 PETSC_INTERN PetscErrorCode VecSetDelta(Vec, PetscInt);
22 PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat, NormType, PetscInt, PetscReal *);
23
24 #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())
25
26 /* Use GPU only if H2OPUS is configured for GPU */
27 #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
28 #define PETSC_H2OPUS_USE_GPU
29 #endif
30 #if defined(PETSC_H2OPUS_USE_GPU)
31 #define MatH2OpusUpdateIfNeeded(A, B) MatBindToCPU(A, (PetscBool)((A)->boundtocpu || (B)))
32 #else
33 #define MatH2OpusUpdateIfNeeded(A, B) PETSC_SUCCESS
34 #endif
35
36 // TODO H2OPUS:
37 // DistributedHMatrix
38 // unsymmetric ?
39 // transpose for distributed_hgemv?
40 // clearData()
41 // Unify interface for sequential and parallel?
42 // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
43 //
44 template <class T>
45 class PetscPointCloud : public H2OpusDataSet<T> {
46 private:
47 int dimension;
48 size_t num_points;
49 std::vector<T> pts;
50
51 public:
PetscPointCloud(int dim,size_t num_pts,const T coords[])52 PetscPointCloud(int dim, size_t num_pts, const T coords[])
53 {
54 dim = dim > 0 ? dim : 1;
55 this->dimension = dim;
56 this->num_points = num_pts;
57
58 pts.resize(num_pts * dim);
59 if (coords) {
60 for (size_t n = 0; n < num_pts; n++)
61 for (int i = 0; i < dim; i++) pts[n * dim + i] = coords[n * dim + i];
62 } else {
63 PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0;
64 for (size_t n = 0; n < num_pts; n++) {
65 pts[n * dim] = n * h;
66 for (int i = 1; i < dim; i++) pts[n * dim + i] = 0.0;
67 }
68 }
69 }
70
PetscPointCloud(const PetscPointCloud<T> & other)71 PetscPointCloud(const PetscPointCloud<T> &other)
72 {
73 size_t N = other.dimension * other.num_points;
74 this->dimension = other.dimension;
75 this->num_points = other.num_points;
76 this->pts.resize(N);
77 for (size_t i = 0; i < N; i++) this->pts[i] = other.pts[i];
78 }
79
getDimension() const80 int getDimension() const { return dimension; }
81
getDataSetSize() const82 size_t getDataSetSize() const { return num_points; }
83
getDataPoint(size_t idx,int dim) const84 T getDataPoint(size_t idx, int dim) const
85 {
86 assert(dim < dimension && idx < num_points);
87 return pts[idx * dimension + dim];
88 }
89
Print(std::ostream & out=std::cout)90 void Print(std::ostream &out = std::cout)
91 {
92 out << "Dimension: " << dimension << std::endl;
93 out << "NumPoints: " << num_points << std::endl;
94 for (size_t n = 0; n < num_points; n++) {
95 for (int d = 0; d < dimension; d++) out << pts[n * dimension + d] << " ";
96 out << std::endl;
97 }
98 }
99 };
100
101 template <class T>
102 class PetscFunctionGenerator {
103 private:
104 MatH2OpusKernelFn *k;
105 int dim;
106 void *ctx;
107
108 public:
PetscFunctionGenerator(MatH2OpusKernelFn * k,int dim,PetscCtx ctx)109 PetscFunctionGenerator(MatH2OpusKernelFn *k, int dim, PetscCtx ctx)
110 {
111 this->k = k;
112 this->dim = dim;
113 this->ctx = ctx;
114 }
PetscFunctionGenerator(PetscFunctionGenerator & other)115 PetscFunctionGenerator(PetscFunctionGenerator &other)
116 {
117 this->k = other.k;
118 this->dim = other.dim;
119 this->ctx = other.ctx;
120 }
operator ()(PetscReal * pt1,PetscReal * pt2)121 T operator()(PetscReal *pt1, PetscReal *pt2) { return (T)((*this->k)(this->dim, pt1, pt2, this->ctx)); }
122 };
123
124 #include <../src/mat/impls/h2opus/math2opussampler.hpp>
125
126 /* just to not clutter the code */
127 #if !defined(H2OPUS_USE_GPU)
128 typedef HMatrix HMatrix_GPU;
129 #if defined(H2OPUS_USE_MPI)
130 typedef DistributedHMatrix DistributedHMatrix_GPU;
131 #endif
132 #endif
133
134 typedef struct {
135 #if defined(H2OPUS_USE_MPI)
136 distributedH2OpusHandle_t handle;
137 #else
138 h2opusHandle_t handle;
139 #endif
140 /* Sequential and parallel matrices are two different classes at the moment */
141 HMatrix *hmatrix;
142 #if defined(H2OPUS_USE_MPI)
143 DistributedHMatrix *dist_hmatrix;
144 #else
145 HMatrix *dist_hmatrix; /* just to not clutter the code */
146 #endif
147 /* May use permutations */
148 PetscSF sf;
149 PetscLayout h2opus_rmap, h2opus_cmap;
150 IS h2opus_indexmap;
151 thrust::host_vector<PetscScalar> *xx, *yy;
152 PetscInt xxs, yys;
153 PetscBool multsetup;
154
155 /* GPU */
156 HMatrix_GPU *hmatrix_gpu;
157 #if defined(H2OPUS_USE_MPI)
158 DistributedHMatrix_GPU *dist_hmatrix_gpu;
159 #else
160 HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */
161 #endif
162 #if defined(PETSC_H2OPUS_USE_GPU)
163 thrust::device_vector<PetscScalar> *xx_gpu, *yy_gpu;
164 PetscInt xxs_gpu, yys_gpu;
165 #endif
166
167 /* construction from matvecs */
168 PetscMatrixSampler *sampler;
169 PetscBool nativemult;
170
171 /* Admissibility */
172 PetscReal eta;
173 PetscInt leafsize;
174
175 /* for dof reordering */
176 PetscPointCloud<PetscReal> *ptcloud;
177
178 /* kernel for generating matrix entries */
179 PetscFunctionGenerator<PetscScalar> *kernel;
180
181 /* basis orthogonalized? */
182 PetscBool orthogonal;
183
184 /* customization */
185 PetscInt basisord;
186 PetscInt max_rank;
187 PetscInt bs;
188 PetscReal rtol;
189 PetscInt norm_max_samples;
190 PetscBool check_construction;
191 PetscBool hara_verbose;
192 PetscBool resize;
193
194 /* keeps track of MatScale values */
195 PetscScalar s;
196 } Mat_H2OPUS;
197
MatDestroy_H2OPUS(Mat A)198 static PetscErrorCode MatDestroy_H2OPUS(Mat A)
199 {
200 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
201
202 PetscFunctionBegin;
203 #if defined(H2OPUS_USE_MPI)
204 h2opusDestroyDistributedHandle(a->handle);
205 #else
206 h2opusDestroyHandle(a->handle);
207 #endif
208 delete a->dist_hmatrix;
209 delete a->hmatrix;
210 PetscCall(PetscSFDestroy(&a->sf));
211 PetscCall(PetscLayoutDestroy(&a->h2opus_rmap));
212 PetscCall(PetscLayoutDestroy(&a->h2opus_cmap));
213 PetscCall(ISDestroy(&a->h2opus_indexmap));
214 delete a->xx;
215 delete a->yy;
216 delete a->hmatrix_gpu;
217 delete a->dist_hmatrix_gpu;
218 #if defined(PETSC_H2OPUS_USE_GPU)
219 delete a->xx_gpu;
220 delete a->yy_gpu;
221 #endif
222 delete a->sampler;
223 delete a->ptcloud;
224 delete a->kernel;
225 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", NULL));
226 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", NULL));
227 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", NULL));
228 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", NULL));
229 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
230 PetscCall(PetscFree(A->data));
231 PetscFunctionReturn(PETSC_SUCCESS);
232 }
233
MatH2OpusSetNativeMult(Mat A,PetscBool nm)234 PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
235 {
236 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
237 PetscBool ish2opus;
238
239 PetscFunctionBegin;
240 PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
241 PetscValidLogicalCollectiveBool(A, nm, 2);
242 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
243 if (ish2opus) {
244 if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
245 if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
246 PetscLayout t;
247 t = A->rmap;
248 A->rmap = a->h2opus_rmap;
249 a->h2opus_rmap = t;
250 t = A->cmap;
251 A->cmap = a->h2opus_cmap;
252 a->h2opus_cmap = t;
253 }
254 }
255 a->nativemult = nm;
256 }
257 PetscFunctionReturn(PETSC_SUCCESS);
258 }
259
MatH2OpusGetNativeMult(Mat A,PetscBool * nm)260 PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
261 {
262 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
263 PetscBool ish2opus;
264
265 PetscFunctionBegin;
266 PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
267 PetscAssertPointer(nm, 2);
268 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
269 PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
270 *nm = a->nativemult;
271 PetscFunctionReturn(PETSC_SUCCESS);
272 }
273
MatNorm_H2OPUS(Mat A,NormType normtype,PetscReal * n)274 PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal *n)
275 {
276 PetscBool ish2opus;
277 PetscInt nmax = PETSC_DECIDE;
278 Mat_H2OPUS *a = NULL;
279 PetscBool mult = PETSC_FALSE;
280
281 PetscFunctionBegin;
282 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
283 if (ish2opus) { /* set userdefine number of samples and fastpath for mult (norms are order independent) */
284 a = (Mat_H2OPUS *)A->data;
285
286 nmax = a->norm_max_samples;
287 mult = a->nativemult;
288 PetscCall(MatH2OpusSetNativeMult(A, PETSC_TRUE));
289 } else {
290 PetscCall(PetscOptionsGetInt(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_approximate_norm_samples", &nmax, NULL));
291 }
292 PetscCall(MatApproximateNorm_Private(A, normtype, nmax, n));
293 if (a) PetscCall(MatH2OpusSetNativeMult(A, mult));
294 PetscFunctionReturn(PETSC_SUCCESS);
295 }
296
MatH2OpusResizeBuffers_Private(Mat A,PetscInt xN,PetscInt yN)297 static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
298 {
299 Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
300 PetscInt n;
301 PetscBool boundtocpu = PETSC_TRUE;
302
303 PetscFunctionBegin;
304 #if defined(PETSC_H2OPUS_USE_GPU)
305 boundtocpu = A->boundtocpu;
306 #endif
307 PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
308 if (boundtocpu) {
309 if (h2opus->xxs < xN) {
310 h2opus->xx->resize(n * xN);
311 h2opus->xxs = xN;
312 }
313 if (h2opus->yys < yN) {
314 h2opus->yy->resize(n * yN);
315 h2opus->yys = yN;
316 }
317 }
318 #if defined(PETSC_H2OPUS_USE_GPU)
319 if (!boundtocpu) {
320 if (h2opus->xxs_gpu < xN) {
321 h2opus->xx_gpu->resize(n * xN);
322 h2opus->xxs_gpu = xN;
323 }
324 if (h2opus->yys_gpu < yN) {
325 h2opus->yy_gpu->resize(n * yN);
326 h2opus->yys_gpu = yN;
327 }
328 }
329 #endif
330 PetscFunctionReturn(PETSC_SUCCESS);
331 }
332
MatMultNKernel_H2OPUS(Mat A,PetscBool transA,Mat B,Mat C)333 static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
334 {
335 Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
336 #if defined(H2OPUS_USE_MPI)
337 h2opusHandle_t handle = h2opus->handle->handle;
338 #else
339 h2opusHandle_t handle = h2opus->handle;
340 #endif
341 PetscBool boundtocpu = PETSC_TRUE;
342 PetscScalar *xx, *yy, *uxx, *uyy;
343 PetscInt blda, clda;
344 PetscMPIInt size;
345 PetscSF bsf, csf;
346 PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
347
348 PetscFunctionBegin;
349 HLibProfile::clear();
350 #if defined(PETSC_H2OPUS_USE_GPU)
351 boundtocpu = A->boundtocpu;
352 #endif
353 PetscCall(MatDenseGetLDA(B, &blda));
354 PetscCall(MatDenseGetLDA(C, &clda));
355 if (usesf) {
356 PetscInt n;
357
358 PetscCall(MatDenseGetH2OpusStridedSF(B, h2opus->sf, &bsf));
359 PetscCall(MatDenseGetH2OpusStridedSF(C, h2opus->sf, &csf));
360
361 PetscCall(MatH2OpusResizeBuffers_Private(A, B->cmap->N, C->cmap->N));
362 PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
363 blda = n;
364 clda = n;
365 }
366 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
367 if (boundtocpu) {
368 PetscCall(MatDenseGetArrayRead(B, (const PetscScalar **)&xx));
369 PetscCall(MatDenseGetArrayWrite(C, &yy));
370 if (usesf) {
371 uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
372 uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
373 PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
374 PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
375 } else {
376 uxx = xx;
377 uyy = yy;
378 }
379 if (size > 1) {
380 PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
381 PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
382 #if defined(H2OPUS_USE_MPI)
383 distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
384 #endif
385 } else {
386 PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
387 hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
388 }
389 PetscCall(MatDenseRestoreArrayRead(B, (const PetscScalar **)&xx));
390 if (usesf) {
391 PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
392 PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
393 }
394 PetscCall(MatDenseRestoreArrayWrite(C, &yy));
395 #if defined(PETSC_H2OPUS_USE_GPU)
396 } else {
397 PetscBool ciscuda, biscuda;
398
399 /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
400 PetscCall(PetscObjectTypeCompareAny((PetscObject)B, &biscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
401 if (!biscuda) PetscCall(MatConvert(B, MATDENSECUDA, MAT_INPLACE_MATRIX, &B));
402 PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &ciscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
403 if (!ciscuda) {
404 C->assembled = PETSC_TRUE;
405 PetscCall(MatConvert(C, MATDENSECUDA, MAT_INPLACE_MATRIX, &C));
406 }
407 PetscCall(MatDenseCUDAGetArrayRead(B, (const PetscScalar **)&xx));
408 PetscCall(MatDenseCUDAGetArrayWrite(C, &yy));
409 if (usesf) {
410 uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
411 uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
412 PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
413 PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
414 } else {
415 uxx = xx;
416 uyy = yy;
417 }
418 PetscCall(PetscLogGpuTimeBegin());
419 if (size > 1) {
420 PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
421 PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
422 #if defined(H2OPUS_USE_MPI)
423 distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
424 #endif
425 } else {
426 PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
427 hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
428 }
429 PetscCall(PetscLogGpuTimeEnd());
430 PetscCall(MatDenseCUDARestoreArrayRead(B, (const PetscScalar **)&xx));
431 if (usesf) {
432 PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
433 PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
434 }
435 PetscCall(MatDenseCUDARestoreArrayWrite(C, &yy));
436 if (!biscuda) PetscCall(MatConvert(B, MATDENSE, MAT_INPLACE_MATRIX, &B));
437 if (!ciscuda) PetscCall(MatConvert(C, MATDENSE, MAT_INPLACE_MATRIX, &C));
438 #endif
439 }
440 { /* log flops */
441 double gops, time, perf, dev;
442 HLibProfile::getHgemvPerf(gops, time, perf, dev);
443 #if defined(PETSC_H2OPUS_USE_GPU)
444 if (boundtocpu) {
445 PetscCall(PetscLogFlops(1e9 * gops));
446 } else {
447 PetscCall(PetscLogGpuFlops(1e9 * gops));
448 }
449 #else
450 PetscCall(PetscLogFlops(1e9 * gops));
451 #endif
452 }
453 PetscFunctionReturn(PETSC_SUCCESS);
454 }
455
MatProductNumeric_H2OPUS(Mat C)456 static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
457 {
458 Mat_Product *product = C->product;
459
460 PetscFunctionBegin;
461 MatCheckProduct(C, 1);
462 switch (product->type) {
463 case MATPRODUCT_AB:
464 PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_FALSE, product->B, C));
465 break;
466 case MATPRODUCT_AtB:
467 PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_TRUE, product->B, C));
468 break;
469 default:
470 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
471 }
472 PetscFunctionReturn(PETSC_SUCCESS);
473 }
474
MatProductSymbolic_H2OPUS(Mat C)475 static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
476 {
477 Mat_Product *product = C->product;
478 PetscBool cisdense;
479 Mat A, B;
480
481 PetscFunctionBegin;
482 MatCheckProduct(C, 1);
483 A = product->A;
484 B = product->B;
485 switch (product->type) {
486 case MATPRODUCT_AB:
487 PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
488 PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
489 PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
490 if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
491 PetscCall(MatSetUp(C));
492 break;
493 case MATPRODUCT_AtB:
494 PetscCall(MatSetSizes(C, A->cmap->n, B->cmap->n, A->cmap->N, B->cmap->N));
495 PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
496 PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
497 if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
498 PetscCall(MatSetUp(C));
499 break;
500 default:
501 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
502 }
503 C->ops->productsymbolic = NULL;
504 C->ops->productnumeric = MatProductNumeric_H2OPUS;
505 PetscFunctionReturn(PETSC_SUCCESS);
506 }
507
MatProductSetFromOptions_H2OPUS(Mat C)508 static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
509 {
510 PetscFunctionBegin;
511 MatCheckProduct(C, 1);
512 if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
513 PetscFunctionReturn(PETSC_SUCCESS);
514 }
515
MatMultKernel_H2OPUS(Mat A,Vec x,PetscScalar sy,Vec y,PetscBool trans)516 static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
517 {
518 Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
519 #if defined(H2OPUS_USE_MPI)
520 h2opusHandle_t handle = h2opus->handle->handle;
521 #else
522 h2opusHandle_t handle = h2opus->handle;
523 #endif
524 PetscBool boundtocpu = PETSC_TRUE;
525 PetscInt n;
526 PetscScalar *xx, *yy, *uxx, *uyy;
527 PetscMPIInt size;
528 PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
529
530 PetscFunctionBegin;
531 HLibProfile::clear();
532 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
533 #if defined(PETSC_H2OPUS_USE_GPU)
534 boundtocpu = A->boundtocpu;
535 #endif
536 if (usesf) PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
537 else n = A->rmap->n;
538 if (boundtocpu) {
539 PetscCall(VecGetArrayRead(x, (const PetscScalar **)&xx));
540 if (sy == 0.0) {
541 PetscCall(VecGetArrayWrite(y, &yy));
542 } else {
543 PetscCall(VecGetArray(y, &yy));
544 }
545 if (usesf) {
546 uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
547 uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
548
549 PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
550 PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
551 if (sy != 0.0) {
552 PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
553 PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
554 }
555 } else {
556 uxx = xx;
557 uyy = yy;
558 }
559 if (size > 1) {
560 PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
561 PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
562 #if defined(H2OPUS_USE_MPI)
563 distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
564 #endif
565 } else {
566 PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
567 hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
568 }
569 PetscCall(VecRestoreArrayRead(x, (const PetscScalar **)&xx));
570 if (usesf) {
571 PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
572 PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
573 }
574 if (sy == 0.0) {
575 PetscCall(VecRestoreArrayWrite(y, &yy));
576 } else {
577 PetscCall(VecRestoreArray(y, &yy));
578 }
579 #if defined(PETSC_H2OPUS_USE_GPU)
580 } else {
581 PetscCall(VecCUDAGetArrayRead(x, (const PetscScalar **)&xx));
582 if (sy == 0.0) {
583 PetscCall(VecCUDAGetArrayWrite(y, &yy));
584 } else {
585 PetscCall(VecCUDAGetArray(y, &yy));
586 }
587 if (usesf) {
588 uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
589 uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
590
591 PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
592 PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
593 if (sy != 0.0) {
594 PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
595 PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
596 }
597 } else {
598 uxx = xx;
599 uyy = yy;
600 }
601 PetscCall(PetscLogGpuTimeBegin());
602 if (size > 1) {
603 PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
604 PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
605 #if defined(H2OPUS_USE_MPI)
606 distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
607 #endif
608 } else {
609 PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
610 hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
611 }
612 PetscCall(PetscLogGpuTimeEnd());
613 PetscCall(VecCUDARestoreArrayRead(x, (const PetscScalar **)&xx));
614 if (usesf) {
615 PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
616 PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
617 }
618 if (sy == 0.0) {
619 PetscCall(VecCUDARestoreArrayWrite(y, &yy));
620 } else {
621 PetscCall(VecCUDARestoreArray(y, &yy));
622 }
623 #endif
624 }
625 { /* log flops */
626 double gops, time, perf, dev;
627 HLibProfile::getHgemvPerf(gops, time, perf, dev);
628 #if defined(PETSC_H2OPUS_USE_GPU)
629 if (boundtocpu) {
630 PetscCall(PetscLogFlops(1e9 * gops));
631 } else {
632 PetscCall(PetscLogGpuFlops(1e9 * gops));
633 }
634 #else
635 PetscCall(PetscLogFlops(1e9 * gops));
636 #endif
637 }
638 PetscFunctionReturn(PETSC_SUCCESS);
639 }
640
MatMultTranspose_H2OPUS(Mat A,Vec x,Vec y)641 static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
642 {
643 PetscBool xiscuda, yiscuda;
644
645 PetscFunctionBegin;
646 PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
647 PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
648 PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
649 PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_TRUE));
650 PetscFunctionReturn(PETSC_SUCCESS);
651 }
652
MatMult_H2OPUS(Mat A,Vec x,Vec y)653 static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
654 {
655 PetscBool xiscuda, yiscuda;
656
657 PetscFunctionBegin;
658 PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
659 PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
660 PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
661 PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_FALSE));
662 PetscFunctionReturn(PETSC_SUCCESS);
663 }
664
MatMultTransposeAdd_H2OPUS(Mat A,Vec x,Vec y,Vec z)665 static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
666 {
667 PetscBool xiscuda, ziscuda;
668
669 PetscFunctionBegin;
670 PetscCall(VecCopy(y, z));
671 PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
672 PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
673 PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
674 PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_TRUE));
675 PetscFunctionReturn(PETSC_SUCCESS);
676 }
677
MatMultAdd_H2OPUS(Mat A,Vec x,Vec y,Vec z)678 static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
679 {
680 PetscBool xiscuda, ziscuda;
681
682 PetscFunctionBegin;
683 PetscCall(VecCopy(y, z));
684 PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
685 PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
686 PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
687 PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_FALSE));
688 PetscFunctionReturn(PETSC_SUCCESS);
689 }
690
MatScale_H2OPUS(Mat A,PetscScalar s)691 static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
692 {
693 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
694
695 PetscFunctionBegin;
696 a->s *= s;
697 PetscFunctionReturn(PETSC_SUCCESS);
698 }
699
MatSetFromOptions_H2OPUS(Mat A,PetscOptionItems PetscOptionsObject)700 static PetscErrorCode MatSetFromOptions_H2OPUS(Mat A, PetscOptionItems PetscOptionsObject)
701 {
702 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
703
704 PetscFunctionBegin;
705 PetscOptionsHeadBegin(PetscOptionsObject, "H2OPUS options");
706 PetscCall(PetscOptionsInt("-mat_h2opus_leafsize", "Leaf size of cluster tree", NULL, a->leafsize, &a->leafsize, NULL));
707 PetscCall(PetscOptionsReal("-mat_h2opus_eta", "Admissibility condition tolerance", NULL, a->eta, &a->eta, NULL));
708 PetscCall(PetscOptionsInt("-mat_h2opus_order", "Basis order for off-diagonal sampling when constructed from kernel", NULL, a->basisord, &a->basisord, NULL));
709 PetscCall(PetscOptionsInt("-mat_h2opus_maxrank", "Maximum rank when constructed from matvecs", NULL, a->max_rank, &a->max_rank, NULL));
710 PetscCall(PetscOptionsInt("-mat_h2opus_samples", "Maximum number of samples to be taken concurrently when constructing from matvecs", NULL, a->bs, &a->bs, NULL));
711 PetscCall(PetscOptionsInt("-mat_h2opus_normsamples", "Maximum number of samples to be when estimating norms", NULL, a->norm_max_samples, &a->norm_max_samples, NULL));
712 PetscCall(PetscOptionsReal("-mat_h2opus_rtol", "Relative tolerance for construction from sampling", NULL, a->rtol, &a->rtol, NULL));
713 PetscCall(PetscOptionsBool("-mat_h2opus_check", "Check error when constructing from sampling during MatAssemblyEnd()", NULL, a->check_construction, &a->check_construction, NULL));
714 PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose", "Verbose output from hara construction", NULL, a->hara_verbose, &a->hara_verbose, NULL));
715 PetscCall(PetscOptionsBool("-mat_h2opus_resize", "Resize after compression", NULL, a->resize, &a->resize, NULL));
716 PetscOptionsHeadEnd();
717 PetscFunctionReturn(PETSC_SUCCESS);
718 }
719
720 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat, PetscInt, const PetscReal[], PetscBool, MatH2OpusKernelFn *, void *);
721
MatH2OpusInferCoordinates_Private(Mat A)722 static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
723 {
724 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
725 Vec c;
726 PetscInt spacedim;
727 const PetscScalar *coords;
728
729 PetscFunctionBegin;
730 if (a->ptcloud) PetscFunctionReturn(PETSC_SUCCESS);
731 PetscCall(PetscObjectQuery((PetscObject)A, "__math2opus_coords", (PetscObject *)&c));
732 if (!c && a->sampler) {
733 Mat S = a->sampler->GetSamplingMat();
734
735 PetscCall(PetscObjectQuery((PetscObject)S, "__math2opus_coords", (PetscObject *)&c));
736 }
737 if (!c) {
738 PetscCall(MatH2OpusSetCoords_H2OPUS(A, -1, NULL, PETSC_FALSE, NULL, NULL));
739 } else {
740 PetscCall(VecGetArrayRead(c, &coords));
741 PetscCall(VecGetBlockSize(c, &spacedim));
742 PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, PETSC_FALSE, NULL, NULL));
743 PetscCall(VecRestoreArrayRead(c, &coords));
744 }
745 PetscFunctionReturn(PETSC_SUCCESS);
746 }
747
MatSetUpMultiply_H2OPUS(Mat A)748 static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
749 {
750 MPI_Comm comm;
751 PetscMPIInt size;
752 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
753 PetscInt n = 0, *idx = NULL;
754 int *iidx = NULL;
755 PetscCopyMode own;
756 PetscBool rid;
757
758 PetscFunctionBegin;
759 if (a->multsetup) PetscFunctionReturn(PETSC_SUCCESS);
760 if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */
761 PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
762 #if defined(PETSC_H2OPUS_USE_GPU)
763 a->xx_gpu = new thrust::device_vector<PetscScalar>(n);
764 a->yy_gpu = new thrust::device_vector<PetscScalar>(n);
765 a->xxs_gpu = 1;
766 a->yys_gpu = 1;
767 #endif
768 a->xx = new thrust::host_vector<PetscScalar>(n);
769 a->yy = new thrust::host_vector<PetscScalar>(n);
770 a->xxs = 1;
771 a->yys = 1;
772 } else {
773 IS is;
774 PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
775 PetscCallMPI(MPI_Comm_size(comm, &size));
776 if (!a->h2opus_indexmap) {
777 if (size > 1) {
778 PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
779 #if defined(H2OPUS_USE_MPI)
780 iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
781 n = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
782 #endif
783 } else {
784 iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
785 n = a->hmatrix->u_basis_tree.index_map.size();
786 }
787
788 if (PetscDefined(USE_64BIT_INDICES)) {
789 PetscInt i;
790
791 own = PETSC_OWN_POINTER;
792 PetscCall(PetscMalloc1(n, &idx));
793 for (i = 0; i < n; i++) idx[i] = iidx[i];
794 } else {
795 own = PETSC_COPY_VALUES;
796 idx = (PetscInt *)iidx;
797 }
798 PetscCall(ISCreateGeneral(comm, n, idx, own, &is));
799 PetscCall(ISSetPermutation(is));
800 PetscCall(ISViewFromOptions(is, (PetscObject)A, "-mat_h2opus_indexmap_view"));
801 a->h2opus_indexmap = is;
802 }
803 PetscCall(ISGetLocalSize(a->h2opus_indexmap, &n));
804 PetscCall(ISGetIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
805 rid = (PetscBool)(n == A->rmap->n);
806 PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &rid, 1, MPI_C_BOOL, MPI_LAND, comm));
807 if (rid) PetscCall(ISIdentity(a->h2opus_indexmap, &rid));
808 if (!rid) {
809 if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
810 PetscCall(PetscLayoutCreate(comm, &a->h2opus_rmap));
811 PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap, n));
812 PetscCall(PetscLayoutSetUp(a->h2opus_rmap));
813 PetscCall(PetscLayoutReference(a->h2opus_rmap, &a->h2opus_cmap));
814 }
815 PetscCall(PetscSFCreate(comm, &a->sf));
816 PetscCall(PetscSFSetGraphLayout(a->sf, A->rmap, n, NULL, PETSC_OWN_POINTER, idx));
817 PetscCall(PetscSFSetUp(a->sf));
818 PetscCall(PetscSFViewFromOptions(a->sf, (PetscObject)A, "-mat_h2opus_sf_view"));
819 #if defined(PETSC_H2OPUS_USE_GPU)
820 a->xx_gpu = new thrust::device_vector<PetscScalar>(n);
821 a->yy_gpu = new thrust::device_vector<PetscScalar>(n);
822 a->xxs_gpu = 1;
823 a->yys_gpu = 1;
824 #endif
825 a->xx = new thrust::host_vector<PetscScalar>(n);
826 a->yy = new thrust::host_vector<PetscScalar>(n);
827 a->xxs = 1;
828 a->yys = 1;
829 }
830 PetscCall(ISRestoreIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
831 }
832 a->multsetup = PETSC_TRUE;
833 PetscFunctionReturn(PETSC_SUCCESS);
834 }
835
MatAssemblyEnd_H2OPUS(Mat A,MatAssemblyType assemblytype)836 static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
837 {
838 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
839 #if defined(H2OPUS_USE_MPI)
840 h2opusHandle_t handle = a->handle->handle;
841 #else
842 h2opusHandle_t handle = a->handle;
843 #endif
844 PetscBool kernel = PETSC_FALSE;
845 PetscBool boundtocpu = PETSC_TRUE;
846 PetscBool samplingdone = PETSC_FALSE;
847 MPI_Comm comm;
848 PetscMPIInt size;
849
850 PetscFunctionBegin;
851 PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
852 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
853 PetscCheck(A->rmap->N == A->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
854
855 /* XXX */
856 a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));
857
858 PetscCallMPI(MPI_Comm_size(comm, &size));
859 /* TODO REUSABILITY of geometric construction */
860 delete a->hmatrix;
861 delete a->dist_hmatrix;
862 #if defined(PETSC_H2OPUS_USE_GPU)
863 delete a->hmatrix_gpu;
864 delete a->dist_hmatrix_gpu;
865 #endif
866 a->orthogonal = PETSC_FALSE;
867
868 /* TODO: other? */
869 H2OpusBoxCenterAdmissibility adm(a->eta);
870
871 PetscCall(PetscLogEventBegin(MAT_H2Opus_Build, A, 0, 0, 0));
872 if (size > 1) {
873 #if defined(H2OPUS_USE_MPI)
874 a->dist_hmatrix = new DistributedHMatrix(A->rmap->n /* ,A->symmetric */);
875 #else
876 a->dist_hmatrix = NULL;
877 #endif
878 } else a->hmatrix = new HMatrix(A->rmap->n, A->symmetric == PETSC_BOOL3_TRUE);
879 PetscCall(MatH2OpusInferCoordinates_Private(A));
880 PetscCheck(a->ptcloud, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing pointcloud");
881 if (a->kernel) {
882 BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
883 if (size > 1) {
884 PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
885 #if defined(H2OPUS_USE_MPI)
886 buildDistributedHMatrix(*a->dist_hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord, a->handle);
887 #endif
888 } else {
889 buildHMatrix(*a->hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord);
890 }
891 kernel = PETSC_TRUE;
892 } else {
893 PetscCheck(size <= 1, comm, PETSC_ERR_SUP, "Construction from sampling not supported in parallel");
894 buildHMatrixStructure(*a->hmatrix, a->ptcloud, a->leafsize, adm);
895 }
896 PetscCall(MatSetUpMultiply_H2OPUS(A));
897
898 #if defined(PETSC_H2OPUS_USE_GPU)
899 boundtocpu = A->boundtocpu;
900 if (!boundtocpu) {
901 if (size > 1) {
902 PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
903 #if defined(H2OPUS_USE_MPI)
904 a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
905 #endif
906 } else {
907 a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
908 }
909 }
910 #endif
911 if (size == 1) {
912 if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
913 PetscReal Anorm;
914 bool verbose;
915
916 PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_hara_verbose", &a->hara_verbose, NULL));
917 verbose = a->hara_verbose;
918 PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(), NORM_2, a->norm_max_samples, &Anorm));
919 if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF, "Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n", a->max_rank, a->rtol * Anorm, a->rtol, Anorm, boundtocpu ? "CPU" : "GPU", a->bs));
920 if (a->sf && !a->nativemult) a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(), a->hmatrix->u_basis_tree.index_map.data());
921 a->sampler->SetStream(handle->getMainStream());
922 if (boundtocpu) {
923 a->sampler->SetGPUSampling(false);
924 hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
925 #if defined(PETSC_H2OPUS_USE_GPU)
926 } else {
927 a->sampler->SetGPUSampling(true);
928 hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
929 #endif
930 }
931 samplingdone = PETSC_TRUE;
932 }
933 }
934 #if defined(PETSC_H2OPUS_USE_GPU)
935 if (!boundtocpu) {
936 delete a->hmatrix;
937 delete a->dist_hmatrix;
938 a->hmatrix = NULL;
939 a->dist_hmatrix = NULL;
940 }
941 A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
942 #endif
943 PetscCall(PetscLogEventEnd(MAT_H2Opus_Build, A, 0, 0, 0));
944
945 if (!a->s) a->s = 1.0;
946 A->assembled = PETSC_TRUE;
947
948 if (samplingdone) {
949 PetscBool check = a->check_construction;
950 PetscBool checke = PETSC_FALSE;
951
952 PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check", &check, NULL));
953 PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check_explicit", &checke, NULL));
954 if (check) {
955 Mat E, Ae;
956 PetscReal n1, ni, n2;
957 PetscReal n1A, niA, n2A;
958 PetscErrorCodeFn *normfunc;
959
960 Ae = a->sampler->GetSamplingMat();
961 PetscCall(MatConvert(A, MATSHELL, MAT_INITIAL_MATRIX, &E));
962 PetscCall(MatShellSetOperation(E, MATOP_NORM, (PetscErrorCodeFn *)MatNorm_H2OPUS));
963 PetscCall(MatAXPY(E, -1.0, Ae, DIFFERENT_NONZERO_PATTERN));
964 PetscCall(MatNorm(E, NORM_1, &n1));
965 PetscCall(MatNorm(E, NORM_INFINITY, &ni));
966 PetscCall(MatNorm(E, NORM_2, &n2));
967 if (checke) {
968 Mat eA, eE, eAe;
969
970 PetscCall(MatComputeOperator(A, MATAIJ, &eA));
971 PetscCall(MatComputeOperator(E, MATAIJ, &eE));
972 PetscCall(MatComputeOperator(Ae, MATAIJ, &eAe));
973 PetscCall(MatFilter(eA, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
974 PetscCall(MatFilter(eE, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
975 PetscCall(MatFilter(eAe, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
976 PetscCall(PetscObjectSetName((PetscObject)eA, "H2Mat"));
977 PetscCall(MatView(eA, NULL));
978 PetscCall(PetscObjectSetName((PetscObject)eAe, "S"));
979 PetscCall(MatView(eAe, NULL));
980 PetscCall(PetscObjectSetName((PetscObject)eE, "H2Mat - S"));
981 PetscCall(MatView(eE, NULL));
982 PetscCall(MatDestroy(&eA));
983 PetscCall(MatDestroy(&eE));
984 PetscCall(MatDestroy(&eAe));
985 }
986
987 PetscCall(MatGetOperation(Ae, MATOP_NORM, &normfunc));
988 PetscCall(MatSetOperation(Ae, MATOP_NORM, (PetscErrorCodeFn *)MatNorm_H2OPUS));
989 PetscCall(MatNorm(Ae, NORM_1, &n1A));
990 PetscCall(MatNorm(Ae, NORM_INFINITY, &niA));
991 PetscCall(MatNorm(Ae, NORM_2, &n2A));
992 n1A = PetscMax(n1A, PETSC_SMALL);
993 n2A = PetscMax(n2A, PETSC_SMALL);
994 niA = PetscMax(niA, PETSC_SMALL);
995 PetscCall(MatSetOperation(Ae, MATOP_NORM, normfunc));
996 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n", (double)n1, (double)ni, (double)n2, (double)(n1 / n1A), (double)(ni / niA), (double)(n2 / n2A)));
997 PetscCall(MatDestroy(&E));
998 }
999 a->sampler->SetSamplingMat(NULL);
1000 }
1001 PetscFunctionReturn(PETSC_SUCCESS);
1002 }
1003
MatZeroEntries_H2OPUS(Mat A)1004 static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
1005 {
1006 PetscMPIInt size;
1007 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1008
1009 PetscFunctionBegin;
1010 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1011 PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet supported");
1012 a->hmatrix->clearData();
1013 #if defined(PETSC_H2OPUS_USE_GPU)
1014 if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1015 #endif
1016 PetscFunctionReturn(PETSC_SUCCESS);
1017 }
1018
MatDuplicate_H2OPUS(Mat B,MatDuplicateOption op,Mat * nA)1019 static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
1020 {
1021 Mat A;
1022 Mat_H2OPUS *a, *b = (Mat_H2OPUS *)B->data;
1023 PetscBool iscpu = PetscDefined(H2OPUS_USE_GPU) ? PETSC_FALSE : PETSC_TRUE;
1024 MPI_Comm comm;
1025
1026 PetscFunctionBegin;
1027 PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
1028 PetscCall(MatCreate(comm, &A));
1029 PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
1030 PetscCall(MatSetType(A, MATH2OPUS));
1031 PetscCall(MatPropagateSymmetryOptions(B, A));
1032 a = (Mat_H2OPUS *)A->data;
1033
1034 a->eta = b->eta;
1035 a->leafsize = b->leafsize;
1036 a->basisord = b->basisord;
1037 a->max_rank = b->max_rank;
1038 a->bs = b->bs;
1039 a->rtol = b->rtol;
1040 a->norm_max_samples = b->norm_max_samples;
1041 if (op == MAT_COPY_VALUES) a->s = b->s;
1042
1043 a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
1044 if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);
1045
1046 #if defined(H2OPUS_USE_MPI)
1047 if (b->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix);
1048 #if defined(PETSC_H2OPUS_USE_GPU)
1049 if (b->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu);
1050 #endif
1051 #endif
1052 if (b->hmatrix) {
1053 a->hmatrix = new HMatrix(*b->hmatrix);
1054 if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
1055 }
1056 #if defined(PETSC_H2OPUS_USE_GPU)
1057 if (b->hmatrix_gpu) {
1058 a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
1059 if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
1060 }
1061 #endif
1062 if (b->sf) {
1063 PetscCall(PetscObjectReference((PetscObject)b->sf));
1064 a->sf = b->sf;
1065 }
1066 if (b->h2opus_indexmap) {
1067 PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap));
1068 a->h2opus_indexmap = b->h2opus_indexmap;
1069 }
1070
1071 PetscCall(MatSetUp(A));
1072 PetscCall(MatSetUpMultiply_H2OPUS(A));
1073 if (op == MAT_COPY_VALUES) {
1074 A->assembled = PETSC_TRUE;
1075 a->orthogonal = b->orthogonal;
1076 #if defined(PETSC_H2OPUS_USE_GPU)
1077 A->offloadmask = B->offloadmask;
1078 #endif
1079 }
1080 #if defined(PETSC_H2OPUS_USE_GPU)
1081 iscpu = B->boundtocpu;
1082 #endif
1083 PetscCall(MatBindToCPU(A, iscpu));
1084
1085 *nA = A;
1086 PetscFunctionReturn(PETSC_SUCCESS);
1087 }
1088
MatView_H2OPUS(Mat A,PetscViewer view)1089 static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
1090 {
1091 Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
1092 PetscBool isascii, vieweps;
1093 PetscMPIInt size;
1094 PetscViewerFormat format;
1095
1096 PetscFunctionBegin;
1097 PetscCall(PetscObjectTypeCompare((PetscObject)view, PETSCVIEWERASCII, &isascii));
1098 PetscCall(PetscViewerGetFormat(view, &format));
1099 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1100 if (isascii) {
1101 if (format == PETSC_VIEWER_ASCII_MATLAB) {
1102 if (size == 1) {
1103 FILE *fp;
1104 PetscCall(PetscViewerASCIIGetPointer(view, &fp));
1105 dumpHMatrix(*h2opus->hmatrix, 6, fp);
1106 }
1107 } else {
1108 PetscCall(PetscViewerASCIIPrintf(view, " H-Matrix constructed from %s\n", h2opus->kernel ? "Kernel" : "Mat"));
1109 PetscCall(PetscViewerASCIIPrintf(view, " PointCloud dim %" PetscInt_FMT "\n", h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0));
1110 PetscCall(PetscViewerASCIIPrintf(view, " Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n", h2opus->leafsize, (double)h2opus->eta));
1111 if (!h2opus->kernel) {
1112 PetscCall(PetscViewerASCIIPrintf(view, " Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n", h2opus->max_rank, h2opus->bs, (double)h2opus->rtol));
1113 } else {
1114 PetscCall(PetscViewerASCIIPrintf(view, " Off-diagonal blocks approximation order %" PetscInt_FMT "\n", h2opus->basisord));
1115 }
1116 PetscCall(PetscViewerASCIIPrintf(view, " Number of samples for norms %" PetscInt_FMT "\n", h2opus->norm_max_samples));
1117 if (size == 1) {
1118 double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
1119 double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
1120 #if defined(PETSC_HAVE_CUDA)
1121 double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
1122 double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1123 #endif
1124 PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu));
1125 #if defined(PETSC_HAVE_CUDA)
1126 PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu));
1127 #endif
1128 } else {
1129 #if defined(PETSC_HAVE_CUDA)
1130 double matrix_mem[4] = {0., 0., 0., 0.};
1131 PetscMPIInt rsize = 4;
1132 #else
1133 double matrix_mem[2] = {0., 0.};
1134 PetscMPIInt rsize = 2;
1135 #endif
1136 #if defined(H2OPUS_USE_MPI)
1137 matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
1138 matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
1139 #if defined(PETSC_HAVE_CUDA)
1140 matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
1141 matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
1142 #endif
1143 #endif
1144 PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, matrix_mem, rsize, MPI_DOUBLE_PRECISION, MPI_SUM, PetscObjectComm((PetscObject)A)));
1145 PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]));
1146 #if defined(PETSC_HAVE_CUDA)
1147 PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]));
1148 #endif
1149 }
1150 }
1151 }
1152 vieweps = PETSC_FALSE;
1153 PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps", &vieweps, NULL));
1154 if (vieweps) {
1155 char filename[256];
1156 const char *name;
1157
1158 PetscCall(PetscObjectGetName((PetscObject)A, &name));
1159 PetscCall(PetscSNPrintf(filename, sizeof(filename), "%s_structure.eps", name));
1160 PetscCall(PetscOptionsGetString(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps_filename", filename, sizeof(filename), NULL));
1161 outputEps(*h2opus->hmatrix, filename);
1162 }
1163 PetscFunctionReturn(PETSC_SUCCESS);
1164 }
1165
MatH2OpusSetCoords_H2OPUS(Mat A,PetscInt spacedim,const PetscReal coords[],PetscBool cdist,MatH2OpusKernelFn * kernel,void * kernelctx)1166 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernelFn *kernel, void *kernelctx)
1167 {
1168 Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
1169 PetscReal *gcoords;
1170 PetscInt N;
1171 MPI_Comm comm;
1172 PetscMPIInt size;
1173 PetscBool cong;
1174
1175 PetscFunctionBegin;
1176 PetscCall(PetscLayoutSetUp(A->rmap));
1177 PetscCall(PetscLayoutSetUp(A->cmap));
1178 PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1179 PetscCall(MatHasCongruentLayouts(A, &cong));
1180 PetscCheck(cong, comm, PETSC_ERR_SUP, "Only for square matrices with congruent layouts");
1181 N = A->rmap->N;
1182 PetscCallMPI(MPI_Comm_size(comm, &size));
1183 if (spacedim > 0 && size > 1 && cdist) {
1184 PetscSF sf;
1185 MPI_Datatype dtype;
1186
1187 PetscCallMPI(MPI_Type_contiguous(spacedim, MPIU_REAL, &dtype));
1188 PetscCallMPI(MPI_Type_commit(&dtype));
1189
1190 PetscCall(PetscSFCreate(comm, &sf));
1191 PetscCall(PetscSFSetGraphWithPattern(sf, A->rmap, PETSCSF_PATTERN_ALLGATHER));
1192 PetscCall(PetscMalloc1(spacedim * N, &gcoords));
1193 PetscCall(PetscSFBcastBegin(sf, dtype, coords, gcoords, MPI_REPLACE));
1194 PetscCall(PetscSFBcastEnd(sf, dtype, coords, gcoords, MPI_REPLACE));
1195 PetscCall(PetscSFDestroy(&sf));
1196 PetscCallMPI(MPI_Type_free(&dtype));
1197 } else gcoords = (PetscReal *)coords;
1198
1199 delete h2opus->ptcloud;
1200 delete h2opus->kernel;
1201 h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim, N, gcoords);
1202 if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel, spacedim, kernelctx);
1203 if (gcoords != coords) PetscCall(PetscFree(gcoords));
1204 A->preallocated = PETSC_TRUE;
1205 PetscFunctionReturn(PETSC_SUCCESS);
1206 }
1207
1208 #if defined(PETSC_H2OPUS_USE_GPU)
MatBindToCPU_H2OPUS(Mat A,PetscBool flg)1209 static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
1210 {
1211 PetscMPIInt size;
1212 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1213
1214 PetscFunctionBegin;
1215 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1216 if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
1217 if (size > 1) {
1218 PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1219 #if defined(H2OPUS_USE_MPI)
1220 if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
1221 else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
1222 #endif
1223 } else {
1224 PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1225 if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
1226 else *a->hmatrix = *a->hmatrix_gpu;
1227 }
1228 delete a->hmatrix_gpu;
1229 delete a->dist_hmatrix_gpu;
1230 a->hmatrix_gpu = NULL;
1231 a->dist_hmatrix_gpu = NULL;
1232 A->offloadmask = PETSC_OFFLOAD_CPU;
1233 } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
1234 if (size > 1) {
1235 PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1236 #if defined(H2OPUS_USE_MPI)
1237 if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
1238 else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
1239 #endif
1240 } else {
1241 PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1242 if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
1243 else *a->hmatrix_gpu = *a->hmatrix;
1244 }
1245 delete a->hmatrix;
1246 delete a->dist_hmatrix;
1247 a->hmatrix = NULL;
1248 a->dist_hmatrix = NULL;
1249 A->offloadmask = PETSC_OFFLOAD_GPU;
1250 }
1251 PetscCall(PetscFree(A->defaultvectype));
1252 if (!flg) {
1253 PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
1254 } else {
1255 PetscCall(PetscStrallocpy(VECSTANDARD, &A->defaultvectype));
1256 }
1257 A->boundtocpu = flg;
1258 PetscFunctionReturn(PETSC_SUCCESS);
1259 }
1260 #endif
1261
1262 /*MC
1263 MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package {cite}`zampinibouakaramturkiyyahkniokeyes2022`.
1264
1265 Options Database Key:
1266 . -mat_type h2opus - matrix type to "h2opus"
1267
1268 Level: beginner
1269
1270 Notes:
1271 H2Opus implements hierarchical matrices in the $H^2$ flavour. It supports CPU or NVIDIA GPUs.
1272
1273 For CPU only builds, use `./configure --download-h2opus --download-thrust` to install PETSc to use H2Opus.
1274 In order to run on NVIDIA GPUs, use `./configure --download-h2opus --download-magma --download-kblas`.
1275
1276 .seealso: [](ch_matrices), `Mat`, `MATH2OPUS`, `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()`
1277 M*/
MatCreate_H2OPUS(Mat A)1278 PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
1279 {
1280 Mat_H2OPUS *a;
1281 PetscMPIInt size;
1282
1283 PetscFunctionBegin;
1284 #if defined(PETSC_H2OPUS_USE_GPU)
1285 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
1286 #endif
1287 PetscCall(PetscNew(&a));
1288 A->data = (void *)a;
1289
1290 a->eta = 0.9;
1291 a->leafsize = 32;
1292 a->basisord = 4;
1293 a->max_rank = 64;
1294 a->bs = 32;
1295 a->rtol = 1.e-4;
1296 a->s = 1.0;
1297 a->norm_max_samples = 10;
1298 a->resize = PETSC_TRUE; /* reallocate after compression */
1299 #if defined(H2OPUS_USE_MPI)
1300 h2opusCreateDistributedHandleComm(&a->handle, PetscObjectComm((PetscObject)A));
1301 #else
1302 h2opusCreateHandle(&a->handle);
1303 #endif
1304 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1305 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATH2OPUS));
1306 PetscCall(PetscMemzero(A->ops, sizeof(struct _MatOps)));
1307
1308 A->ops->destroy = MatDestroy_H2OPUS;
1309 A->ops->view = MatView_H2OPUS;
1310 A->ops->assemblyend = MatAssemblyEnd_H2OPUS;
1311 A->ops->mult = MatMult_H2OPUS;
1312 A->ops->multtranspose = MatMultTranspose_H2OPUS;
1313 A->ops->multadd = MatMultAdd_H2OPUS;
1314 A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
1315 A->ops->scale = MatScale_H2OPUS;
1316 A->ops->duplicate = MatDuplicate_H2OPUS;
1317 A->ops->setfromoptions = MatSetFromOptions_H2OPUS;
1318 A->ops->norm = MatNorm_H2OPUS;
1319 A->ops->zeroentries = MatZeroEntries_H2OPUS;
1320 #if defined(PETSC_H2OPUS_USE_GPU)
1321 A->ops->bindtocpu = MatBindToCPU_H2OPUS;
1322 #endif
1323
1324 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", MatProductSetFromOptions_H2OPUS));
1325 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", MatProductSetFromOptions_H2OPUS));
1326 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", MatProductSetFromOptions_H2OPUS));
1327 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", MatProductSetFromOptions_H2OPUS));
1328 #if defined(PETSC_H2OPUS_USE_GPU)
1329 PetscCall(PetscFree(A->defaultvectype));
1330 PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
1331 #endif
1332 PetscFunctionReturn(PETSC_SUCCESS);
1333 }
1334
1335 /*@
1336 MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.
1337
1338 Input Parameter:
1339 . A - the matrix
1340
1341 Level: intermediate
1342
1343 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`
1344 @*/
MatH2OpusOrthogonalize(Mat A)1345 PetscErrorCode MatH2OpusOrthogonalize(Mat A)
1346 {
1347 PetscBool ish2opus;
1348 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1349 PetscMPIInt size;
1350 PetscBool boundtocpu = PETSC_TRUE;
1351
1352 PetscFunctionBegin;
1353 PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1354 PetscValidType(A, 1);
1355 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1356 if (!ish2opus) PetscFunctionReturn(PETSC_SUCCESS);
1357 if (a->orthogonal) PetscFunctionReturn(PETSC_SUCCESS);
1358 HLibProfile::clear();
1359 PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog, A, 0, 0, 0));
1360 #if defined(PETSC_H2OPUS_USE_GPU)
1361 boundtocpu = A->boundtocpu;
1362 #endif
1363 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1364 if (size > 1) {
1365 if (boundtocpu) {
1366 PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1367 #if defined(H2OPUS_USE_MPI)
1368 distributed_horthog(*a->dist_hmatrix, a->handle);
1369 #endif
1370 #if defined(PETSC_H2OPUS_USE_GPU)
1371 A->offloadmask = PETSC_OFFLOAD_CPU;
1372 } else {
1373 PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1374 PetscCall(PetscLogGpuTimeBegin());
1375 #if defined(H2OPUS_USE_MPI)
1376 distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
1377 #endif
1378 PetscCall(PetscLogGpuTimeEnd());
1379 #endif
1380 }
1381 } else {
1382 #if defined(H2OPUS_USE_MPI)
1383 h2opusHandle_t handle = a->handle->handle;
1384 #else
1385 h2opusHandle_t handle = a->handle;
1386 #endif
1387 if (boundtocpu) {
1388 PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1389 horthog(*a->hmatrix, handle);
1390 #if defined(PETSC_H2OPUS_USE_GPU)
1391 A->offloadmask = PETSC_OFFLOAD_CPU;
1392 } else {
1393 PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1394 PetscCall(PetscLogGpuTimeBegin());
1395 horthog(*a->hmatrix_gpu, handle);
1396 PetscCall(PetscLogGpuTimeEnd());
1397 #endif
1398 }
1399 }
1400 a->orthogonal = PETSC_TRUE;
1401 { /* log flops */
1402 double gops, time, perf, dev;
1403 HLibProfile::getHorthogPerf(gops, time, perf, dev);
1404 #if defined(PETSC_H2OPUS_USE_GPU)
1405 if (boundtocpu) {
1406 PetscCall(PetscLogFlops(1e9 * gops));
1407 } else {
1408 PetscCall(PetscLogGpuFlops(1e9 * gops));
1409 }
1410 #else
1411 PetscCall(PetscLogFlops(1e9 * gops));
1412 #endif
1413 }
1414 PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog, A, 0, 0, 0));
1415 PetscFunctionReturn(PETSC_SUCCESS);
1416 }
1417
1418 /*@
1419 MatH2OpusCompress - Compress a hierarchical matrix.
1420
1421 Input Parameters:
1422 + A - the matrix
1423 - tol - the absolute truncation threshold
1424
1425 Level: intermediate
1426
1427 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()`
1428 @*/
MatH2OpusCompress(Mat A,PetscReal tol)1429 PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
1430 {
1431 PetscBool ish2opus;
1432 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1433 PetscMPIInt size;
1434 PetscBool boundtocpu = PETSC_TRUE;
1435
1436 PetscFunctionBegin;
1437 PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1438 PetscValidType(A, 1);
1439 PetscValidLogicalCollectiveReal(A, tol, 2);
1440 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1441 if (!ish2opus || tol <= 0.0) PetscFunctionReturn(PETSC_SUCCESS);
1442 PetscCall(MatH2OpusOrthogonalize(A));
1443 HLibProfile::clear();
1444 PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress, A, 0, 0, 0));
1445 #if defined(PETSC_H2OPUS_USE_GPU)
1446 boundtocpu = A->boundtocpu;
1447 #endif
1448 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1449 if (size > 1) {
1450 if (boundtocpu) {
1451 PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1452 #if defined(H2OPUS_USE_MPI)
1453 distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
1454 if (a->resize) {
1455 DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix);
1456 delete a->dist_hmatrix;
1457 a->dist_hmatrix = dist_hmatrix;
1458 }
1459 #endif
1460 #if defined(PETSC_H2OPUS_USE_GPU)
1461 A->offloadmask = PETSC_OFFLOAD_CPU;
1462 } else {
1463 PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1464 PetscCall(PetscLogGpuTimeBegin());
1465 #if defined(H2OPUS_USE_MPI)
1466 distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);
1467
1468 if (a->resize) {
1469 DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu);
1470 delete a->dist_hmatrix_gpu;
1471 a->dist_hmatrix_gpu = dist_hmatrix_gpu;
1472 }
1473 #endif
1474 PetscCall(PetscLogGpuTimeEnd());
1475 #endif
1476 }
1477 } else {
1478 #if defined(H2OPUS_USE_MPI)
1479 h2opusHandle_t handle = a->handle->handle;
1480 #else
1481 h2opusHandle_t handle = a->handle;
1482 #endif
1483 if (boundtocpu) {
1484 PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1485 hcompress(*a->hmatrix, tol, handle);
1486
1487 if (a->resize) {
1488 HMatrix *hmatrix = new HMatrix(*a->hmatrix);
1489 delete a->hmatrix;
1490 a->hmatrix = hmatrix;
1491 }
1492 #if defined(PETSC_H2OPUS_USE_GPU)
1493 A->offloadmask = PETSC_OFFLOAD_CPU;
1494 } else {
1495 PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1496 PetscCall(PetscLogGpuTimeBegin());
1497 hcompress(*a->hmatrix_gpu, tol, handle);
1498 PetscCall(PetscLogGpuTimeEnd());
1499
1500 if (a->resize) {
1501 HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu);
1502 delete a->hmatrix_gpu;
1503 a->hmatrix_gpu = hmatrix_gpu;
1504 }
1505 #endif
1506 }
1507 }
1508 { /* log flops */
1509 double gops, time, perf, dev;
1510 HLibProfile::getHcompressPerf(gops, time, perf, dev);
1511 #if defined(PETSC_H2OPUS_USE_GPU)
1512 if (boundtocpu) {
1513 PetscCall(PetscLogFlops(1e9 * gops));
1514 } else {
1515 PetscCall(PetscLogGpuFlops(1e9 * gops));
1516 }
1517 #else
1518 PetscCall(PetscLogFlops(1e9 * gops));
1519 #endif
1520 }
1521 PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress, A, 0, 0, 0));
1522 PetscFunctionReturn(PETSC_SUCCESS);
1523 }
1524
1525 /*@
1526 MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix-vector products on another matrix to construct a hierarchical matrix.
1527
1528 Input Parameters:
1529 + A - the hierarchical matrix
1530 . B - the matrix to be sampled
1531 . bs - maximum number of samples to be taken concurrently
1532 - tol - relative tolerance for construction
1533
1534 Level: intermediate
1535
1536 Notes:
1537 You need to call `MatAssemblyBegin()` and `MatAssemblyEnd()` to update the hierarchical matrix.
1538
1539 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`
1540 @*/
MatH2OpusSetSamplingMat(Mat A,Mat B,PetscInt bs,PetscReal tol)1541 PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1542 {
1543 PetscBool ish2opus;
1544
1545 PetscFunctionBegin;
1546 PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1547 PetscValidType(A, 1);
1548 if (B) PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
1549 PetscValidLogicalCollectiveInt(A, bs, 3);
1550 PetscValidLogicalCollectiveReal(A, tol, 4);
1551 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1552 if (ish2opus) {
1553 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1554
1555 if (!a->sampler) a->sampler = new PetscMatrixSampler();
1556 a->sampler->SetSamplingMat(B);
1557 if (bs > 0) a->bs = bs;
1558 if (tol > 0.) a->rtol = tol;
1559 delete a->kernel;
1560 }
1561 PetscFunctionReturn(PETSC_SUCCESS);
1562 }
1563
1564 /*@C
1565 MatCreateH2OpusFromKernel - Creates a `MATH2OPUS` from a user-supplied kernel.
1566
1567 Input Parameters:
1568 + comm - MPI communicator
1569 . m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
1570 . n - number of local columns (or `PETSC_DECIDE` to have calculated if `N` is given)
1571 . M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
1572 . N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
1573 . spacedim - dimension of the space coordinates
1574 . coords - coordinates of the points
1575 . cdist - whether or not coordinates are distributed
1576 . kernel - computational kernel (or `NULL`)
1577 . kernelctx - kernel context
1578 . eta - admissibility condition tolerance
1579 . leafsize - leaf size in cluster tree
1580 - basisord - approximation order for Chebychev interpolation of low-rank blocks
1581
1582 Output Parameter:
1583 . nA - matrix
1584
1585 Options Database Keys:
1586 + -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree
1587 . -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance
1588 . -mat_h2opus_order <`PetscInt`> - Chebychev approximation order
1589 - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms
1590
1591 Level: intermediate
1592
1593 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`
1594 @*/
MatCreateH2OpusFromKernel(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt spacedim,const PetscReal coords[],PetscBool cdist,MatH2OpusKernelFn * kernel,void * kernelctx,PetscReal eta,PetscInt leafsize,PetscInt basisord,Mat * nA)1595 PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernelFn *kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat *nA)
1596 {
1597 Mat A;
1598 Mat_H2OPUS *h2opus;
1599 PetscBool iscpu = PetscDefined(H2OPUS_USE_GPU) ? PETSC_FALSE : PETSC_TRUE;
1600
1601 PetscFunctionBegin;
1602 PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1603 PetscCall(MatCreate(comm, &A));
1604 PetscCall(MatSetSizes(A, m, n, M, N));
1605 PetscCheck(M == N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1606 PetscCall(MatSetType(A, MATH2OPUS));
1607 PetscCall(MatBindToCPU(A, iscpu));
1608 PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, kernel, kernelctx));
1609
1610 h2opus = (Mat_H2OPUS *)A->data;
1611 if (eta > 0.) h2opus->eta = eta;
1612 if (leafsize > 0) h2opus->leafsize = leafsize;
1613 if (basisord > 0) h2opus->basisord = basisord;
1614
1615 *nA = A;
1616 PetscFunctionReturn(PETSC_SUCCESS);
1617 }
1618
1619 /*@
1620 MatCreateH2OpusFromMat - Creates a `MATH2OPUS` sampling from a user-supplied operator.
1621
1622 Input Parameters:
1623 + B - the matrix to be sampled
1624 . spacedim - dimension of the space coordinates
1625 . coords - coordinates of the points
1626 . cdist - whether or not coordinates are distributed
1627 . eta - admissibility condition tolerance
1628 . leafsize - leaf size in cluster tree
1629 . maxrank - maximum rank allowed
1630 . bs - maximum number of samples to be taken concurrently
1631 - rtol - relative tolerance for construction
1632
1633 Output Parameter:
1634 . nA - matrix
1635
1636 Options Database Keys:
1637 + -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree
1638 . -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance
1639 . -mat_h2opus_maxrank <`PetscInt`> - Maximum rank when constructed from matvecs
1640 . -mat_h2opus_samples <`PetscInt`> - Maximum number of samples to be taken concurrently when constructing from matvecs
1641 . -mat_h2opus_rtol <`PetscReal`> - Relative tolerance for construction from sampling
1642 . -mat_h2opus_check <`PetscBool`> - Check error when constructing from sampling during MatAssemblyEnd()
1643 . -mat_h2opus_hara_verbose <`PetscBool`> - Verbose output from hara construction
1644 - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be when estimating norms
1645
1646 Level: intermediate
1647
1648 Note:
1649 Not available in parallel
1650
1651 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()`
1652 @*/
MatCreateH2OpusFromMat(Mat B,PetscInt spacedim,const PetscReal coords[],PetscBool cdist,PetscReal eta,PetscInt leafsize,PetscInt maxrank,PetscInt bs,PetscReal rtol,Mat * nA)1653 PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1654 {
1655 Mat A;
1656 Mat_H2OPUS *h2opus;
1657 MPI_Comm comm;
1658 PetscBool boundtocpu = PETSC_TRUE;
1659
1660 PetscFunctionBegin;
1661 PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
1662 PetscValidLogicalCollectiveInt(B, spacedim, 2);
1663 PetscValidLogicalCollectiveBool(B, cdist, 4);
1664 PetscValidLogicalCollectiveReal(B, eta, 5);
1665 PetscValidLogicalCollectiveInt(B, leafsize, 6);
1666 PetscValidLogicalCollectiveInt(B, maxrank, 7);
1667 PetscValidLogicalCollectiveInt(B, bs, 8);
1668 PetscValidLogicalCollectiveReal(B, rtol, 9);
1669 PetscAssertPointer(nA, 10);
1670 PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
1671 PetscCheck(B->rmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1672 PetscCheck(B->rmap->N == B->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1673 PetscCall(MatCreate(comm, &A));
1674 PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
1675 #if defined(PETSC_H2OPUS_USE_GPU)
1676 {
1677 VecType vtype;
1678 PetscBool isstd, iscuda, iskok;
1679
1680 PetscCall(MatGetVecType(B, &vtype));
1681 PetscCall(PetscStrcmpAny(vtype, &isstd, VECSTANDARD, VECSEQ, VECMPI, ""));
1682 PetscCall(PetscStrcmpAny(vtype, &iscuda, VECCUDA, VECSEQCUDA, VECMPICUDA, ""));
1683 PetscCall(PetscStrcmpAny(vtype, &iskok, VECKOKKOS, VECSEQKOKKOS, VECMPIKOKKOS, ""));
1684 PetscCheck(isstd || iscuda || iskok, comm, PETSC_ERR_SUP, "Not for type %s", vtype);
1685 if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
1686 if (iskok && PetscDefined(HAVE_MACRO_KOKKOS_ENABLE_CUDA)) boundtocpu = PETSC_FALSE;
1687 }
1688 #endif
1689 PetscCall(MatSetType(A, MATH2OPUS));
1690 PetscCall(MatBindToCPU(A, boundtocpu));
1691 if (spacedim) PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, NULL, NULL));
1692 PetscCall(MatPropagateSymmetryOptions(B, A));
1693 /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */
1694
1695 h2opus = (Mat_H2OPUS *)A->data;
1696 h2opus->sampler = new PetscMatrixSampler(B);
1697 if (eta > 0.) h2opus->eta = eta;
1698 if (leafsize > 0) h2opus->leafsize = leafsize;
1699 if (maxrank > 0) h2opus->max_rank = maxrank;
1700 if (bs > 0) h2opus->bs = bs;
1701 if (rtol > 0.) h2opus->rtol = rtol;
1702 *nA = A;
1703 A->preallocated = PETSC_TRUE;
1704 PetscFunctionReturn(PETSC_SUCCESS);
1705 }
1706
1707 /*@
1708 MatH2OpusGetIndexMap - Access reordering index set.
1709
1710 Input Parameter:
1711 . A - the matrix
1712
1713 Output Parameter:
1714 . indexmap - the index set for the reordering
1715
1716 Level: intermediate
1717
1718 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1719 @*/
MatH2OpusGetIndexMap(Mat A,IS * indexmap)1720 PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1721 {
1722 PetscBool ish2opus;
1723 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1724
1725 PetscFunctionBegin;
1726 PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1727 PetscValidType(A, 1);
1728 PetscAssertPointer(indexmap, 2);
1729 PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1730 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1731 PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1732 *indexmap = a->h2opus_indexmap;
1733 PetscFunctionReturn(PETSC_SUCCESS);
1734 }
1735
1736 /*@
1737 MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering
1738
1739 Input Parameters:
1740 + A - the matrix
1741 . nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1742 - in - the vector to be mapped
1743
1744 Output Parameter:
1745 . out - the newly created mapped vector
1746
1747 Level: intermediate
1748
1749 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1750 @*/
MatH2OpusMapVec(Mat A,PetscBool nativetopetsc,Vec in,Vec * out)1751 PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec *out)
1752 {
1753 PetscBool ish2opus;
1754 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1755 PetscScalar *xin, *xout;
1756 PetscBool nm;
1757
1758 PetscFunctionBegin;
1759 PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1760 PetscValidType(A, 1);
1761 PetscValidLogicalCollectiveBool(A, nativetopetsc, 2);
1762 PetscValidHeaderSpecific(in, VEC_CLASSID, 3);
1763 PetscAssertPointer(out, 4);
1764 PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1765 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1766 PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1767 nm = a->nativemult;
1768 PetscCall(MatH2OpusSetNativeMult(A, (PetscBool)!nativetopetsc));
1769 PetscCall(MatCreateVecs(A, out, NULL));
1770 PetscCall(MatH2OpusSetNativeMult(A, nm));
1771 if (!a->sf) { /* same ordering */
1772 PetscCall(VecCopy(in, *out));
1773 PetscFunctionReturn(PETSC_SUCCESS);
1774 }
1775 PetscCall(VecGetArrayRead(in, (const PetscScalar **)&xin));
1776 PetscCall(VecGetArrayWrite(*out, &xout));
1777 if (nativetopetsc) {
1778 PetscCall(PetscSFReduceBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1779 PetscCall(PetscSFReduceEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1780 } else {
1781 PetscCall(PetscSFBcastBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1782 PetscCall(PetscSFBcastEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1783 }
1784 PetscCall(VecRestoreArrayRead(in, (const PetscScalar **)&xin));
1785 PetscCall(VecRestoreArrayWrite(*out, &xout));
1786 PetscFunctionReturn(PETSC_SUCCESS);
1787 }
1788
1789 /*@
1790 MatH2OpusLowRankUpdate - Perform a low-rank update of the form $ A = A + s * U * V^T $
1791
1792 Input Parameters:
1793 + A - the hierarchical `MATH2OPUS` matrix
1794 . s - the scaling factor
1795 . U - the dense low-rank update matrix
1796 - V - (optional) the dense low-rank update matrix (if `NULL`, then `V` = `U` is assumed)
1797
1798 Note:
1799 The `U` and `V` matrices must be in `MATDENSE` dense format
1800
1801 Level: intermediate
1802
1803 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE`
1804 @*/
MatH2OpusLowRankUpdate(Mat A,Mat U,Mat V,PetscScalar s)1805 PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s)
1806 {
1807 PetscBool flg;
1808
1809 PetscFunctionBegin;
1810 PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1811 PetscValidType(A, 1);
1812 PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1813 PetscValidHeaderSpecific(U, MAT_CLASSID, 2);
1814 PetscCheckSameComm(A, 1, U, 2);
1815 if (V) {
1816 PetscValidHeaderSpecific(V, MAT_CLASSID, 3);
1817 PetscCheckSameComm(A, 1, V, 3);
1818 }
1819 PetscValidLogicalCollectiveScalar(A, s, 4);
1820
1821 if (!V) V = U;
1822 PetscCheck(U->cmap->N == V->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT, U->cmap->N, V->cmap->N);
1823 if (!U->cmap->N) PetscFunctionReturn(PETSC_SUCCESS);
1824 PetscCall(PetscLayoutCompare(U->rmap, A->rmap, &flg));
1825 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A and U must have the same row layout");
1826 PetscCall(PetscLayoutCompare(V->rmap, A->cmap, &flg));
1827 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A column layout must match V row column layout");
1828 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &flg));
1829 if (flg) {
1830 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1831 const PetscScalar *u, *v, *uu, *vv;
1832 PetscInt ldu, ldv;
1833 PetscMPIInt size;
1834 #if defined(H2OPUS_USE_MPI)
1835 h2opusHandle_t handle = a->handle->handle;
1836 #else
1837 h2opusHandle_t handle = a->handle;
1838 #endif
1839 PetscBool usesf = (PetscBool)(a->sf && !a->nativemult);
1840 PetscSF usf, vsf;
1841
1842 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1843 PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet implemented in parallel");
1844 PetscCall(PetscLogEventBegin(MAT_H2Opus_LR, A, 0, 0, 0));
1845 PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U, &flg, MATSEQDENSE, MATMPIDENSE, ""));
1846 PetscCheck(flg, PetscObjectComm((PetscObject)U), PETSC_ERR_SUP, "Not for U of type %s", ((PetscObject)U)->type_name);
1847 PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V, &flg, MATSEQDENSE, MATMPIDENSE, ""));
1848 PetscCheck(flg, PetscObjectComm((PetscObject)V), PETSC_ERR_SUP, "Not for V of type %s", ((PetscObject)V)->type_name);
1849 PetscCall(MatDenseGetLDA(U, &ldu));
1850 PetscCall(MatDenseGetLDA(V, &ldv));
1851 PetscCall(MatBoundToCPU(A, &flg));
1852 if (usesf) {
1853 PetscInt n;
1854
1855 PetscCall(MatDenseGetH2OpusStridedSF(U, a->sf, &usf));
1856 PetscCall(MatDenseGetH2OpusStridedSF(V, a->sf, &vsf));
1857 PetscCall(MatH2OpusResizeBuffers_Private(A, U->cmap->N, V->cmap->N));
1858 PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
1859 ldu = n;
1860 ldv = n;
1861 }
1862 if (flg) {
1863 PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1864 PetscCall(MatDenseGetArrayRead(U, &u));
1865 PetscCall(MatDenseGetArrayRead(V, &v));
1866 if (usesf) {
1867 vv = MatH2OpusGetThrustPointer(*a->yy);
1868 PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1869 PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1870 if (U != V) {
1871 uu = MatH2OpusGetThrustPointer(*a->xx);
1872 PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1873 PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1874 } else uu = vv;
1875 } else {
1876 uu = u;
1877 vv = v;
1878 }
1879 hlru_global(*a->hmatrix, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1880 PetscCall(MatDenseRestoreArrayRead(U, &u));
1881 PetscCall(MatDenseRestoreArrayRead(V, &v));
1882 } else {
1883 #if defined(PETSC_H2OPUS_USE_GPU)
1884 PetscBool flgU, flgV;
1885
1886 PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1887 PetscCall(PetscObjectTypeCompareAny((PetscObject)U, &flgU, MATSEQDENSE, MATMPIDENSE, ""));
1888 if (flgU) PetscCall(MatConvert(U, MATDENSECUDA, MAT_INPLACE_MATRIX, &U));
1889 PetscCall(PetscObjectTypeCompareAny((PetscObject)V, &flgV, MATSEQDENSE, MATMPIDENSE, ""));
1890 if (flgV) PetscCall(MatConvert(V, MATDENSECUDA, MAT_INPLACE_MATRIX, &V));
1891 PetscCall(MatDenseCUDAGetArrayRead(U, &u));
1892 PetscCall(MatDenseCUDAGetArrayRead(V, &v));
1893 if (usesf) {
1894 vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
1895 PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1896 PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1897 if (U != V) {
1898 uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
1899 PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1900 PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1901 } else uu = vv;
1902 } else {
1903 uu = u;
1904 vv = v;
1905 }
1906 #else
1907 SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "This should not happen");
1908 #endif
1909 hlru_global(*a->hmatrix_gpu, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1910 #if defined(PETSC_H2OPUS_USE_GPU)
1911 PetscCall(MatDenseCUDARestoreArrayRead(U, &u));
1912 PetscCall(MatDenseCUDARestoreArrayRead(V, &v));
1913 if (flgU) PetscCall(MatConvert(U, MATDENSE, MAT_INPLACE_MATRIX, &U));
1914 if (flgV) PetscCall(MatConvert(V, MATDENSE, MAT_INPLACE_MATRIX, &V));
1915 #endif
1916 }
1917 PetscCall(PetscLogEventEnd(MAT_H2Opus_LR, A, 0, 0, 0));
1918 a->orthogonal = PETSC_FALSE;
1919 }
1920 PetscFunctionReturn(PETSC_SUCCESS);
1921 }
1922 #endif
1923