1f17f7ee2SSatish Balay #include <h2opusconf.h>
2f17f7ee2SSatish Balay /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
3f17f7ee2SSatish Balay #if !defined(__CUDACC__) || defined(H2OPUS_USE_GPU)
4f17f7ee2SSatish Balay #include <h2opus.h>
5f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
6f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_h2opus_handle.h>
7f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_geometric_construction.h>
8f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_hgemv.h>
9f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_horthog.h>
10f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_hcompress.h>
11f17f7ee2SSatish Balay #endif
12f17f7ee2SSatish Balay #include <h2opus/util/boxentrygen.h>
13f17f7ee2SSatish Balay #include <petsc/private/matimpl.h>
14f17f7ee2SSatish Balay #include <petsc/private/vecimpl.h>
15cc4c1da9SBarry Smith #include <petsc/private/deviceimpl.h> /*I "petscmat.h" I*/
16f17f7ee2SSatish Balay #include <petscsf.h>
17f17f7ee2SSatish Balay
18f17f7ee2SSatish Balay /* math2opusutils */
190dd791a8SStefano Zampini PETSC_INTERN PetscErrorCode MatDenseGetH2OpusStridedSF(Mat, PetscSF, PetscSF *);
20f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode VecSign(Vec, Vec);
21f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode VecSetDelta(Vec, PetscInt);
22f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat, NormType, PetscInt, PetscReal *);
23f17f7ee2SSatish Balay
24f17f7ee2SSatish Balay #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())
25f17f7ee2SSatish Balay
26f17f7ee2SSatish Balay /* Use GPU only if H2OPUS is configured for GPU */
27f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
28f17f7ee2SSatish Balay #define PETSC_H2OPUS_USE_GPU
29f17f7ee2SSatish Balay #endif
30f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
31f17f7ee2SSatish Balay #define MatH2OpusUpdateIfNeeded(A, B) MatBindToCPU(A, (PetscBool)((A)->boundtocpu || (B)))
32f17f7ee2SSatish Balay #else
333ba16761SJacob Faibussowitsch #define MatH2OpusUpdateIfNeeded(A, B) PETSC_SUCCESS
34f17f7ee2SSatish Balay #endif
35f17f7ee2SSatish Balay
36f17f7ee2SSatish Balay // TODO H2OPUS:
37f17f7ee2SSatish Balay // DistributedHMatrix
38f17f7ee2SSatish Balay // unsymmetric ?
39f17f7ee2SSatish Balay // transpose for distributed_hgemv?
40f17f7ee2SSatish Balay // clearData()
41f17f7ee2SSatish Balay // Unify interface for sequential and parallel?
42f17f7ee2SSatish Balay // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
43f17f7ee2SSatish Balay //
449371c9d4SSatish Balay template <class T>
459371c9d4SSatish Balay class PetscPointCloud : public H2OpusDataSet<T> {
46f17f7ee2SSatish Balay private:
47f17f7ee2SSatish Balay int dimension;
48f17f7ee2SSatish Balay size_t num_points;
49f17f7ee2SSatish Balay std::vector<T> pts;
50f17f7ee2SSatish Balay
51f17f7ee2SSatish Balay public:
PetscPointCloud(int dim,size_t num_pts,const T coords[])52d71ae5a4SJacob Faibussowitsch PetscPointCloud(int dim, size_t num_pts, const T coords[])
53d71ae5a4SJacob Faibussowitsch {
54f17f7ee2SSatish Balay dim = dim > 0 ? dim : 1;
55f17f7ee2SSatish Balay this->dimension = dim;
56f17f7ee2SSatish Balay this->num_points = num_pts;
57f17f7ee2SSatish Balay
58f17f7ee2SSatish Balay pts.resize(num_pts * dim);
59f17f7ee2SSatish Balay if (coords) {
60036e4bdcSSatish Balay for (size_t n = 0; n < num_pts; n++)
619371c9d4SSatish Balay for (int i = 0; i < dim; i++) pts[n * dim + i] = coords[n * dim + i];
62f17f7ee2SSatish Balay } else {
63036e4bdcSSatish Balay PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0;
64036e4bdcSSatish Balay for (size_t n = 0; n < num_pts; n++) {
65036e4bdcSSatish Balay pts[n * dim] = n * h;
669371c9d4SSatish Balay for (int i = 1; i < dim; i++) pts[n * dim + i] = 0.0;
67036e4bdcSSatish Balay }
68f17f7ee2SSatish Balay }
69f17f7ee2SSatish Balay }
70f17f7ee2SSatish Balay
PetscPointCloud(const PetscPointCloud<T> & other)71d71ae5a4SJacob Faibussowitsch PetscPointCloud(const PetscPointCloud<T> &other)
72d71ae5a4SJacob Faibussowitsch {
73f17f7ee2SSatish Balay size_t N = other.dimension * other.num_points;
74f17f7ee2SSatish Balay this->dimension = other.dimension;
75f17f7ee2SSatish Balay this->num_points = other.num_points;
76f17f7ee2SSatish Balay this->pts.resize(N);
779371c9d4SSatish Balay for (size_t i = 0; i < N; i++) this->pts[i] = other.pts[i];
78f17f7ee2SSatish Balay }
79f17f7ee2SSatish Balay
getDimension() const809371c9d4SSatish Balay int getDimension() const { return dimension; }
81f17f7ee2SSatish Balay
getDataSetSize() const829371c9d4SSatish Balay size_t getDataSetSize() const { return num_points; }
83f17f7ee2SSatish Balay
getDataPoint(size_t idx,int dim) const84d71ae5a4SJacob Faibussowitsch T getDataPoint(size_t idx, int dim) const
85d71ae5a4SJacob Faibussowitsch {
86f17f7ee2SSatish Balay assert(dim < dimension && idx < num_points);
87f17f7ee2SSatish Balay return pts[idx * dimension + dim];
88f17f7ee2SSatish Balay }
89f17f7ee2SSatish Balay
Print(std::ostream & out=std::cout)90d71ae5a4SJacob Faibussowitsch void Print(std::ostream &out = std::cout)
91d71ae5a4SJacob Faibussowitsch {
92f17f7ee2SSatish Balay out << "Dimension: " << dimension << std::endl;
93f17f7ee2SSatish Balay out << "NumPoints: " << num_points << std::endl;
94f17f7ee2SSatish Balay for (size_t n = 0; n < num_points; n++) {
959371c9d4SSatish Balay for (int d = 0; d < dimension; d++) out << pts[n * dimension + d] << " ";
96f17f7ee2SSatish Balay out << std::endl;
97f17f7ee2SSatish Balay }
98f17f7ee2SSatish Balay }
99f17f7ee2SSatish Balay };
100f17f7ee2SSatish Balay
1019371c9d4SSatish Balay template <class T>
1029371c9d4SSatish Balay class PetscFunctionGenerator {
103f17f7ee2SSatish Balay private:
1048434afd1SBarry Smith MatH2OpusKernelFn *k;
105f17f7ee2SSatish Balay int dim;
106f17f7ee2SSatish Balay void *ctx;
107f17f7ee2SSatish Balay
108f17f7ee2SSatish Balay public:
PetscFunctionGenerator(MatH2OpusKernelFn * k,int dim,PetscCtx ctx)1092a8381b2SBarry Smith PetscFunctionGenerator(MatH2OpusKernelFn *k, int dim, PetscCtx ctx)
110d71ae5a4SJacob Faibussowitsch {
1119371c9d4SSatish Balay this->k = k;
1129371c9d4SSatish Balay this->dim = dim;
1139371c9d4SSatish Balay this->ctx = ctx;
114f17f7ee2SSatish Balay }
PetscFunctionGenerator(PetscFunctionGenerator & other)115d71ae5a4SJacob Faibussowitsch PetscFunctionGenerator(PetscFunctionGenerator &other)
116d71ae5a4SJacob Faibussowitsch {
1179371c9d4SSatish Balay this->k = other.k;
1189371c9d4SSatish Balay this->dim = other.dim;
1199371c9d4SSatish Balay this->ctx = other.ctx;
1209371c9d4SSatish Balay }
operator ()(PetscReal * pt1,PetscReal * pt2)1219371c9d4SSatish Balay T operator()(PetscReal *pt1, PetscReal *pt2) { return (T)((*this->k)(this->dim, pt1, pt2, this->ctx)); }
122f17f7ee2SSatish Balay };
123f17f7ee2SSatish Balay
124f17f7ee2SSatish Balay #include <../src/mat/impls/h2opus/math2opussampler.hpp>
125f17f7ee2SSatish Balay
126f17f7ee2SSatish Balay /* just to not clutter the code */
127f17f7ee2SSatish Balay #if !defined(H2OPUS_USE_GPU)
128f17f7ee2SSatish Balay typedef HMatrix HMatrix_GPU;
129f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
130f17f7ee2SSatish Balay typedef DistributedHMatrix DistributedHMatrix_GPU;
131f17f7ee2SSatish Balay #endif
132f17f7ee2SSatish Balay #endif
133f17f7ee2SSatish Balay
134f17f7ee2SSatish Balay typedef struct {
135f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
136f17f7ee2SSatish Balay distributedH2OpusHandle_t handle;
137f17f7ee2SSatish Balay #else
138f17f7ee2SSatish Balay h2opusHandle_t handle;
139f17f7ee2SSatish Balay #endif
140f17f7ee2SSatish Balay /* Sequential and parallel matrices are two different classes at the moment */
141f17f7ee2SSatish Balay HMatrix *hmatrix;
142f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
143f17f7ee2SSatish Balay DistributedHMatrix *dist_hmatrix;
144f17f7ee2SSatish Balay #else
145f17f7ee2SSatish Balay HMatrix *dist_hmatrix; /* just to not clutter the code */
146f17f7ee2SSatish Balay #endif
147f17f7ee2SSatish Balay /* May use permutations */
148f17f7ee2SSatish Balay PetscSF sf;
149f17f7ee2SSatish Balay PetscLayout h2opus_rmap, h2opus_cmap;
150f17f7ee2SSatish Balay IS h2opus_indexmap;
151f17f7ee2SSatish Balay thrust::host_vector<PetscScalar> *xx, *yy;
152f17f7ee2SSatish Balay PetscInt xxs, yys;
153f17f7ee2SSatish Balay PetscBool multsetup;
154f17f7ee2SSatish Balay
155f17f7ee2SSatish Balay /* GPU */
156f17f7ee2SSatish Balay HMatrix_GPU *hmatrix_gpu;
157f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
158f17f7ee2SSatish Balay DistributedHMatrix_GPU *dist_hmatrix_gpu;
159f17f7ee2SSatish Balay #else
160f17f7ee2SSatish Balay HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */
161f17f7ee2SSatish Balay #endif
162f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
163f17f7ee2SSatish Balay thrust::device_vector<PetscScalar> *xx_gpu, *yy_gpu;
164f17f7ee2SSatish Balay PetscInt xxs_gpu, yys_gpu;
165f17f7ee2SSatish Balay #endif
166f17f7ee2SSatish Balay
167f17f7ee2SSatish Balay /* construction from matvecs */
168f17f7ee2SSatish Balay PetscMatrixSampler *sampler;
169f17f7ee2SSatish Balay PetscBool nativemult;
170f17f7ee2SSatish Balay
171f17f7ee2SSatish Balay /* Admissibility */
172f17f7ee2SSatish Balay PetscReal eta;
173f17f7ee2SSatish Balay PetscInt leafsize;
174f17f7ee2SSatish Balay
175f17f7ee2SSatish Balay /* for dof reordering */
176f17f7ee2SSatish Balay PetscPointCloud<PetscReal> *ptcloud;
177f17f7ee2SSatish Balay
178f17f7ee2SSatish Balay /* kernel for generating matrix entries */
179f17f7ee2SSatish Balay PetscFunctionGenerator<PetscScalar> *kernel;
180f17f7ee2SSatish Balay
181f17f7ee2SSatish Balay /* basis orthogonalized? */
182f17f7ee2SSatish Balay PetscBool orthogonal;
183f17f7ee2SSatish Balay
184f17f7ee2SSatish Balay /* customization */
185f17f7ee2SSatish Balay PetscInt basisord;
186f17f7ee2SSatish Balay PetscInt max_rank;
187f17f7ee2SSatish Balay PetscInt bs;
188f17f7ee2SSatish Balay PetscReal rtol;
189f17f7ee2SSatish Balay PetscInt norm_max_samples;
190f17f7ee2SSatish Balay PetscBool check_construction;
191f17f7ee2SSatish Balay PetscBool hara_verbose;
192036e4bdcSSatish Balay PetscBool resize;
193f17f7ee2SSatish Balay
194f17f7ee2SSatish Balay /* keeps track of MatScale values */
195f17f7ee2SSatish Balay PetscScalar s;
196f17f7ee2SSatish Balay } Mat_H2OPUS;
197f17f7ee2SSatish Balay
MatDestroy_H2OPUS(Mat A)198d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_H2OPUS(Mat A)
199d71ae5a4SJacob Faibussowitsch {
200f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
201f17f7ee2SSatish Balay
202f17f7ee2SSatish Balay PetscFunctionBegin;
203f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
204f17f7ee2SSatish Balay h2opusDestroyDistributedHandle(a->handle);
205f17f7ee2SSatish Balay #else
206f17f7ee2SSatish Balay h2opusDestroyHandle(a->handle);
207f17f7ee2SSatish Balay #endif
208f17f7ee2SSatish Balay delete a->dist_hmatrix;
209f17f7ee2SSatish Balay delete a->hmatrix;
210f17f7ee2SSatish Balay PetscCall(PetscSFDestroy(&a->sf));
211f17f7ee2SSatish Balay PetscCall(PetscLayoutDestroy(&a->h2opus_rmap));
212f17f7ee2SSatish Balay PetscCall(PetscLayoutDestroy(&a->h2opus_cmap));
213f17f7ee2SSatish Balay PetscCall(ISDestroy(&a->h2opus_indexmap));
214f17f7ee2SSatish Balay delete a->xx;
215f17f7ee2SSatish Balay delete a->yy;
216f17f7ee2SSatish Balay delete a->hmatrix_gpu;
217f17f7ee2SSatish Balay delete a->dist_hmatrix_gpu;
218f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
219f17f7ee2SSatish Balay delete a->xx_gpu;
220f17f7ee2SSatish Balay delete a->yy_gpu;
221f17f7ee2SSatish Balay #endif
222f17f7ee2SSatish Balay delete a->sampler;
223f17f7ee2SSatish Balay delete a->ptcloud;
224f17f7ee2SSatish Balay delete a->kernel;
225f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", NULL));
226f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", NULL));
227f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", NULL));
228f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", NULL));
229f17f7ee2SSatish Balay PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
230f17f7ee2SSatish Balay PetscCall(PetscFree(A->data));
2313ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
232f17f7ee2SSatish Balay }
233f17f7ee2SSatish Balay
MatH2OpusSetNativeMult(Mat A,PetscBool nm)234d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
235d71ae5a4SJacob Faibussowitsch {
236f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
237f17f7ee2SSatish Balay PetscBool ish2opus;
238f17f7ee2SSatish Balay
239f17f7ee2SSatish Balay PetscFunctionBegin;
240f17f7ee2SSatish Balay PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
241f17f7ee2SSatish Balay PetscValidLogicalCollectiveBool(A, nm, 2);
242f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
243f17f7ee2SSatish Balay if (ish2opus) {
244f17f7ee2SSatish Balay if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
245f17f7ee2SSatish Balay if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
246f17f7ee2SSatish Balay PetscLayout t;
247f17f7ee2SSatish Balay t = A->rmap;
248f17f7ee2SSatish Balay A->rmap = a->h2opus_rmap;
249f17f7ee2SSatish Balay a->h2opus_rmap = t;
250f17f7ee2SSatish Balay t = A->cmap;
251f17f7ee2SSatish Balay A->cmap = a->h2opus_cmap;
252f17f7ee2SSatish Balay a->h2opus_cmap = t;
253f17f7ee2SSatish Balay }
254f17f7ee2SSatish Balay }
255f17f7ee2SSatish Balay a->nativemult = nm;
256f17f7ee2SSatish Balay }
2573ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
258f17f7ee2SSatish Balay }
259f17f7ee2SSatish Balay
MatH2OpusGetNativeMult(Mat A,PetscBool * nm)260d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
261d71ae5a4SJacob Faibussowitsch {
262f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
263f17f7ee2SSatish Balay PetscBool ish2opus;
264f17f7ee2SSatish Balay
265f17f7ee2SSatish Balay PetscFunctionBegin;
266f17f7ee2SSatish Balay PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
2674f572ea9SToby Isaac PetscAssertPointer(nm, 2);
268f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
269f17f7ee2SSatish Balay PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
270f17f7ee2SSatish Balay *nm = a->nativemult;
2713ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
272f17f7ee2SSatish Balay }
273f17f7ee2SSatish Balay
MatNorm_H2OPUS(Mat A,NormType normtype,PetscReal * n)274d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal *n)
275d71ae5a4SJacob Faibussowitsch {
276f17f7ee2SSatish Balay PetscBool ish2opus;
277f17f7ee2SSatish Balay PetscInt nmax = PETSC_DECIDE;
278f17f7ee2SSatish Balay Mat_H2OPUS *a = NULL;
279f17f7ee2SSatish Balay PetscBool mult = PETSC_FALSE;
280f17f7ee2SSatish Balay
281f17f7ee2SSatish Balay PetscFunctionBegin;
282f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
283f17f7ee2SSatish Balay if (ish2opus) { /* set userdefine number of samples and fastpath for mult (norms are order independent) */
284f17f7ee2SSatish Balay a = (Mat_H2OPUS *)A->data;
285f17f7ee2SSatish Balay
286f17f7ee2SSatish Balay nmax = a->norm_max_samples;
287f17f7ee2SSatish Balay mult = a->nativemult;
288f17f7ee2SSatish Balay PetscCall(MatH2OpusSetNativeMult(A, PETSC_TRUE));
289f17f7ee2SSatish Balay } else {
290f17f7ee2SSatish Balay PetscCall(PetscOptionsGetInt(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_approximate_norm_samples", &nmax, NULL));
291f17f7ee2SSatish Balay }
292f17f7ee2SSatish Balay PetscCall(MatApproximateNorm_Private(A, normtype, nmax, n));
293f17f7ee2SSatish Balay if (a) PetscCall(MatH2OpusSetNativeMult(A, mult));
2943ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
295f17f7ee2SSatish Balay }
296f17f7ee2SSatish Balay
MatH2OpusResizeBuffers_Private(Mat A,PetscInt xN,PetscInt yN)297d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
298d71ae5a4SJacob Faibussowitsch {
299f17f7ee2SSatish Balay Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
300f17f7ee2SSatish Balay PetscInt n;
301f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE;
302f17f7ee2SSatish Balay
303f17f7ee2SSatish Balay PetscFunctionBegin;
304f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
305f17f7ee2SSatish Balay boundtocpu = A->boundtocpu;
306f17f7ee2SSatish Balay #endif
307f17f7ee2SSatish Balay PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
308f17f7ee2SSatish Balay if (boundtocpu) {
3099371c9d4SSatish Balay if (h2opus->xxs < xN) {
3109371c9d4SSatish Balay h2opus->xx->resize(n * xN);
3119371c9d4SSatish Balay h2opus->xxs = xN;
3129371c9d4SSatish Balay }
3139371c9d4SSatish Balay if (h2opus->yys < yN) {
3149371c9d4SSatish Balay h2opus->yy->resize(n * yN);
3159371c9d4SSatish Balay h2opus->yys = yN;
3169371c9d4SSatish Balay }
317f17f7ee2SSatish Balay }
318f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
319f17f7ee2SSatish Balay if (!boundtocpu) {
3209371c9d4SSatish Balay if (h2opus->xxs_gpu < xN) {
3219371c9d4SSatish Balay h2opus->xx_gpu->resize(n * xN);
3229371c9d4SSatish Balay h2opus->xxs_gpu = xN;
3239371c9d4SSatish Balay }
3249371c9d4SSatish Balay if (h2opus->yys_gpu < yN) {
3259371c9d4SSatish Balay h2opus->yy_gpu->resize(n * yN);
3269371c9d4SSatish Balay h2opus->yys_gpu = yN;
3279371c9d4SSatish Balay }
328f17f7ee2SSatish Balay }
329f17f7ee2SSatish Balay #endif
3303ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
331f17f7ee2SSatish Balay }
332f17f7ee2SSatish Balay
MatMultNKernel_H2OPUS(Mat A,PetscBool transA,Mat B,Mat C)333d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
334d71ae5a4SJacob Faibussowitsch {
335f17f7ee2SSatish Balay Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
336f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
337f17f7ee2SSatish Balay h2opusHandle_t handle = h2opus->handle->handle;
338f17f7ee2SSatish Balay #else
339f17f7ee2SSatish Balay h2opusHandle_t handle = h2opus->handle;
340f17f7ee2SSatish Balay #endif
341f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE;
342f17f7ee2SSatish Balay PetscScalar *xx, *yy, *uxx, *uyy;
343f17f7ee2SSatish Balay PetscInt blda, clda;
344f17f7ee2SSatish Balay PetscMPIInt size;
345f17f7ee2SSatish Balay PetscSF bsf, csf;
346f17f7ee2SSatish Balay PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
347f17f7ee2SSatish Balay
348f17f7ee2SSatish Balay PetscFunctionBegin;
349f17f7ee2SSatish Balay HLibProfile::clear();
350f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
351f17f7ee2SSatish Balay boundtocpu = A->boundtocpu;
352f17f7ee2SSatish Balay #endif
353f17f7ee2SSatish Balay PetscCall(MatDenseGetLDA(B, &blda));
354f17f7ee2SSatish Balay PetscCall(MatDenseGetLDA(C, &clda));
355f17f7ee2SSatish Balay if (usesf) {
356f17f7ee2SSatish Balay PetscInt n;
357f17f7ee2SSatish Balay
3580dd791a8SStefano Zampini PetscCall(MatDenseGetH2OpusStridedSF(B, h2opus->sf, &bsf));
3590dd791a8SStefano Zampini PetscCall(MatDenseGetH2OpusStridedSF(C, h2opus->sf, &csf));
360f17f7ee2SSatish Balay
361f17f7ee2SSatish Balay PetscCall(MatH2OpusResizeBuffers_Private(A, B->cmap->N, C->cmap->N));
362f17f7ee2SSatish Balay PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
363f17f7ee2SSatish Balay blda = n;
364f17f7ee2SSatish Balay clda = n;
365f17f7ee2SSatish Balay }
366f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
367f17f7ee2SSatish Balay if (boundtocpu) {
368f17f7ee2SSatish Balay PetscCall(MatDenseGetArrayRead(B, (const PetscScalar **)&xx));
369f17f7ee2SSatish Balay PetscCall(MatDenseGetArrayWrite(C, &yy));
370f17f7ee2SSatish Balay if (usesf) {
371f17f7ee2SSatish Balay uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
372f17f7ee2SSatish Balay uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
373f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
374f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
375f17f7ee2SSatish Balay } else {
376f17f7ee2SSatish Balay uxx = xx;
377f17f7ee2SSatish Balay uyy = yy;
378f17f7ee2SSatish Balay }
379f17f7ee2SSatish Balay if (size > 1) {
380f17f7ee2SSatish Balay PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
381036e4bdcSSatish Balay PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
382f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
383f17f7ee2SSatish Balay distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
384f17f7ee2SSatish Balay #endif
385f17f7ee2SSatish Balay } else {
386f17f7ee2SSatish Balay PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
387f17f7ee2SSatish Balay hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
388f17f7ee2SSatish Balay }
389f17f7ee2SSatish Balay PetscCall(MatDenseRestoreArrayRead(B, (const PetscScalar **)&xx));
390f17f7ee2SSatish Balay if (usesf) {
391f17f7ee2SSatish Balay PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
392f17f7ee2SSatish Balay PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
393f17f7ee2SSatish Balay }
394f17f7ee2SSatish Balay PetscCall(MatDenseRestoreArrayWrite(C, &yy));
395f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
396f17f7ee2SSatish Balay } else {
397f17f7ee2SSatish Balay PetscBool ciscuda, biscuda;
398f17f7ee2SSatish Balay
399f17f7ee2SSatish Balay /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
400f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)B, &biscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
40148a46eb9SPierre Jolivet if (!biscuda) PetscCall(MatConvert(B, MATDENSECUDA, MAT_INPLACE_MATRIX, &B));
402f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &ciscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
403f17f7ee2SSatish Balay if (!ciscuda) {
404f17f7ee2SSatish Balay C->assembled = PETSC_TRUE;
405f17f7ee2SSatish Balay PetscCall(MatConvert(C, MATDENSECUDA, MAT_INPLACE_MATRIX, &C));
406f17f7ee2SSatish Balay }
407f17f7ee2SSatish Balay PetscCall(MatDenseCUDAGetArrayRead(B, (const PetscScalar **)&xx));
408f17f7ee2SSatish Balay PetscCall(MatDenseCUDAGetArrayWrite(C, &yy));
409f17f7ee2SSatish Balay if (usesf) {
410f17f7ee2SSatish Balay uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
411f17f7ee2SSatish Balay uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
412f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
413f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
414f17f7ee2SSatish Balay } else {
415f17f7ee2SSatish Balay uxx = xx;
416f17f7ee2SSatish Balay uyy = yy;
417f17f7ee2SSatish Balay }
418f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin());
419f17f7ee2SSatish Balay if (size > 1) {
420f17f7ee2SSatish Balay PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
421036e4bdcSSatish Balay PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
422f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
423f17f7ee2SSatish Balay distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
424f17f7ee2SSatish Balay #endif
425f17f7ee2SSatish Balay } else {
426f17f7ee2SSatish Balay PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
427f17f7ee2SSatish Balay hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
428f17f7ee2SSatish Balay }
429f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd());
430f17f7ee2SSatish Balay PetscCall(MatDenseCUDARestoreArrayRead(B, (const PetscScalar **)&xx));
431f17f7ee2SSatish Balay if (usesf) {
432f17f7ee2SSatish Balay PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
433f17f7ee2SSatish Balay PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
434f17f7ee2SSatish Balay }
435f17f7ee2SSatish Balay PetscCall(MatDenseCUDARestoreArrayWrite(C, &yy));
43648a46eb9SPierre Jolivet if (!biscuda) PetscCall(MatConvert(B, MATDENSE, MAT_INPLACE_MATRIX, &B));
43748a46eb9SPierre Jolivet if (!ciscuda) PetscCall(MatConvert(C, MATDENSE, MAT_INPLACE_MATRIX, &C));
438f17f7ee2SSatish Balay #endif
439f17f7ee2SSatish Balay }
440f17f7ee2SSatish Balay { /* log flops */
441f17f7ee2SSatish Balay double gops, time, perf, dev;
442f17f7ee2SSatish Balay HLibProfile::getHgemvPerf(gops, time, perf, dev);
443f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
444f17f7ee2SSatish Balay if (boundtocpu) {
445f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9 * gops));
446f17f7ee2SSatish Balay } else {
447f17f7ee2SSatish Balay PetscCall(PetscLogGpuFlops(1e9 * gops));
448f17f7ee2SSatish Balay }
449f17f7ee2SSatish Balay #else
450f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9 * gops));
451f17f7ee2SSatish Balay #endif
452f17f7ee2SSatish Balay }
4533ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
454f17f7ee2SSatish Balay }
455f17f7ee2SSatish Balay
MatProductNumeric_H2OPUS(Mat C)456d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
457d71ae5a4SJacob Faibussowitsch {
458f17f7ee2SSatish Balay Mat_Product *product = C->product;
459f17f7ee2SSatish Balay
460f17f7ee2SSatish Balay PetscFunctionBegin;
461f17f7ee2SSatish Balay MatCheckProduct(C, 1);
462f17f7ee2SSatish Balay switch (product->type) {
463d71ae5a4SJacob Faibussowitsch case MATPRODUCT_AB:
464d71ae5a4SJacob Faibussowitsch PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_FALSE, product->B, C));
465d71ae5a4SJacob Faibussowitsch break;
466d71ae5a4SJacob Faibussowitsch case MATPRODUCT_AtB:
467d71ae5a4SJacob Faibussowitsch PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_TRUE, product->B, C));
468d71ae5a4SJacob Faibussowitsch break;
469d71ae5a4SJacob Faibussowitsch default:
470d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
471f17f7ee2SSatish Balay }
4723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
473f17f7ee2SSatish Balay }
474f17f7ee2SSatish Balay
MatProductSymbolic_H2OPUS(Mat C)475d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
476d71ae5a4SJacob Faibussowitsch {
477f17f7ee2SSatish Balay Mat_Product *product = C->product;
478f17f7ee2SSatish Balay PetscBool cisdense;
479f17f7ee2SSatish Balay Mat A, B;
480f17f7ee2SSatish Balay
481f17f7ee2SSatish Balay PetscFunctionBegin;
482f17f7ee2SSatish Balay MatCheckProduct(C, 1);
483f17f7ee2SSatish Balay A = product->A;
484f17f7ee2SSatish Balay B = product->B;
485f17f7ee2SSatish Balay switch (product->type) {
486f17f7ee2SSatish Balay case MATPRODUCT_AB:
487f17f7ee2SSatish Balay PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
488f17f7ee2SSatish Balay PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
489f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
490f17f7ee2SSatish Balay if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
491f17f7ee2SSatish Balay PetscCall(MatSetUp(C));
492f17f7ee2SSatish Balay break;
493f17f7ee2SSatish Balay case MATPRODUCT_AtB:
494f17f7ee2SSatish Balay PetscCall(MatSetSizes(C, A->cmap->n, B->cmap->n, A->cmap->N, B->cmap->N));
495f17f7ee2SSatish Balay PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
496f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
497f17f7ee2SSatish Balay if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
498f17f7ee2SSatish Balay PetscCall(MatSetUp(C));
499f17f7ee2SSatish Balay break;
500d71ae5a4SJacob Faibussowitsch default:
501d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
502f17f7ee2SSatish Balay }
503f17f7ee2SSatish Balay C->ops->productsymbolic = NULL;
504f17f7ee2SSatish Balay C->ops->productnumeric = MatProductNumeric_H2OPUS;
5053ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
506f17f7ee2SSatish Balay }
507f17f7ee2SSatish Balay
MatProductSetFromOptions_H2OPUS(Mat C)508d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
509d71ae5a4SJacob Faibussowitsch {
510f17f7ee2SSatish Balay PetscFunctionBegin;
511f17f7ee2SSatish Balay MatCheckProduct(C, 1);
512ad540459SPierre Jolivet if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
5133ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
514f17f7ee2SSatish Balay }
515f17f7ee2SSatish Balay
MatMultKernel_H2OPUS(Mat A,Vec x,PetscScalar sy,Vec y,PetscBool trans)516d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
517d71ae5a4SJacob Faibussowitsch {
518f17f7ee2SSatish Balay Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
519f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
520f17f7ee2SSatish Balay h2opusHandle_t handle = h2opus->handle->handle;
521f17f7ee2SSatish Balay #else
522f17f7ee2SSatish Balay h2opusHandle_t handle = h2opus->handle;
523f17f7ee2SSatish Balay #endif
524f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE;
525f17f7ee2SSatish Balay PetscInt n;
526f17f7ee2SSatish Balay PetscScalar *xx, *yy, *uxx, *uyy;
527f17f7ee2SSatish Balay PetscMPIInt size;
528f17f7ee2SSatish Balay PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
529f17f7ee2SSatish Balay
530f17f7ee2SSatish Balay PetscFunctionBegin;
531f17f7ee2SSatish Balay HLibProfile::clear();
532f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
533f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
534f17f7ee2SSatish Balay boundtocpu = A->boundtocpu;
535f17f7ee2SSatish Balay #endif
536ac530a7eSPierre Jolivet if (usesf) PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
537ac530a7eSPierre Jolivet else n = A->rmap->n;
538f17f7ee2SSatish Balay if (boundtocpu) {
539f17f7ee2SSatish Balay PetscCall(VecGetArrayRead(x, (const PetscScalar **)&xx));
540f17f7ee2SSatish Balay if (sy == 0.0) {
541f17f7ee2SSatish Balay PetscCall(VecGetArrayWrite(y, &yy));
542f17f7ee2SSatish Balay } else {
543f17f7ee2SSatish Balay PetscCall(VecGetArray(y, &yy));
544f17f7ee2SSatish Balay }
545f17f7ee2SSatish Balay if (usesf) {
546f17f7ee2SSatish Balay uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
547f17f7ee2SSatish Balay uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
548f17f7ee2SSatish Balay
549f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
550f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
551f17f7ee2SSatish Balay if (sy != 0.0) {
552f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
553f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
554f17f7ee2SSatish Balay }
555f17f7ee2SSatish Balay } else {
556f17f7ee2SSatish Balay uxx = xx;
557f17f7ee2SSatish Balay uyy = yy;
558f17f7ee2SSatish Balay }
559f17f7ee2SSatish Balay if (size > 1) {
560f17f7ee2SSatish Balay PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
561036e4bdcSSatish Balay PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
562f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
563f17f7ee2SSatish Balay distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
564f17f7ee2SSatish Balay #endif
565f17f7ee2SSatish Balay } else {
566f17f7ee2SSatish Balay PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
567f17f7ee2SSatish Balay hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
568f17f7ee2SSatish Balay }
569f17f7ee2SSatish Balay PetscCall(VecRestoreArrayRead(x, (const PetscScalar **)&xx));
570f17f7ee2SSatish Balay if (usesf) {
571f17f7ee2SSatish Balay PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
572f17f7ee2SSatish Balay PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
573f17f7ee2SSatish Balay }
574f17f7ee2SSatish Balay if (sy == 0.0) {
575f17f7ee2SSatish Balay PetscCall(VecRestoreArrayWrite(y, &yy));
576f17f7ee2SSatish Balay } else {
577f17f7ee2SSatish Balay PetscCall(VecRestoreArray(y, &yy));
578f17f7ee2SSatish Balay }
579f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
580f17f7ee2SSatish Balay } else {
581f17f7ee2SSatish Balay PetscCall(VecCUDAGetArrayRead(x, (const PetscScalar **)&xx));
582f17f7ee2SSatish Balay if (sy == 0.0) {
583f17f7ee2SSatish Balay PetscCall(VecCUDAGetArrayWrite(y, &yy));
584f17f7ee2SSatish Balay } else {
585f17f7ee2SSatish Balay PetscCall(VecCUDAGetArray(y, &yy));
586f17f7ee2SSatish Balay }
587f17f7ee2SSatish Balay if (usesf) {
588f17f7ee2SSatish Balay uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
589f17f7ee2SSatish Balay uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
590f17f7ee2SSatish Balay
591f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
592f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
593f17f7ee2SSatish Balay if (sy != 0.0) {
594f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
595f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
596f17f7ee2SSatish Balay }
597f17f7ee2SSatish Balay } else {
598f17f7ee2SSatish Balay uxx = xx;
599f17f7ee2SSatish Balay uyy = yy;
600f17f7ee2SSatish Balay }
601f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin());
602f17f7ee2SSatish Balay if (size > 1) {
603f17f7ee2SSatish Balay PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
604036e4bdcSSatish Balay PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
605f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
606f17f7ee2SSatish Balay distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
607f17f7ee2SSatish Balay #endif
608f17f7ee2SSatish Balay } else {
609f17f7ee2SSatish Balay PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
610f17f7ee2SSatish Balay hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
611f17f7ee2SSatish Balay }
612f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd());
613f17f7ee2SSatish Balay PetscCall(VecCUDARestoreArrayRead(x, (const PetscScalar **)&xx));
614f17f7ee2SSatish Balay if (usesf) {
615f17f7ee2SSatish Balay PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
616f17f7ee2SSatish Balay PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
617f17f7ee2SSatish Balay }
618f17f7ee2SSatish Balay if (sy == 0.0) {
619f17f7ee2SSatish Balay PetscCall(VecCUDARestoreArrayWrite(y, &yy));
620f17f7ee2SSatish Balay } else {
621f17f7ee2SSatish Balay PetscCall(VecCUDARestoreArray(y, &yy));
622f17f7ee2SSatish Balay }
623f17f7ee2SSatish Balay #endif
624f17f7ee2SSatish Balay }
625f17f7ee2SSatish Balay { /* log flops */
626f17f7ee2SSatish Balay double gops, time, perf, dev;
627f17f7ee2SSatish Balay HLibProfile::getHgemvPerf(gops, time, perf, dev);
628f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
629f17f7ee2SSatish Balay if (boundtocpu) {
630f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9 * gops));
631f17f7ee2SSatish Balay } else {
632f17f7ee2SSatish Balay PetscCall(PetscLogGpuFlops(1e9 * gops));
633f17f7ee2SSatish Balay }
634f17f7ee2SSatish Balay #else
635f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9 * gops));
636f17f7ee2SSatish Balay #endif
637f17f7ee2SSatish Balay }
6383ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
639f17f7ee2SSatish Balay }
640f17f7ee2SSatish Balay
MatMultTranspose_H2OPUS(Mat A,Vec x,Vec y)641d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
642d71ae5a4SJacob Faibussowitsch {
643f17f7ee2SSatish Balay PetscBool xiscuda, yiscuda;
644f17f7ee2SSatish Balay
645f17f7ee2SSatish Balay PetscFunctionBegin;
646f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
647f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
648f17f7ee2SSatish Balay PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
649f17f7ee2SSatish Balay PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_TRUE));
6503ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
651f17f7ee2SSatish Balay }
652f17f7ee2SSatish Balay
MatMult_H2OPUS(Mat A,Vec x,Vec y)653d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
654d71ae5a4SJacob Faibussowitsch {
655f17f7ee2SSatish Balay PetscBool xiscuda, yiscuda;
656f17f7ee2SSatish Balay
657f17f7ee2SSatish Balay PetscFunctionBegin;
658f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
659f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
660f17f7ee2SSatish Balay PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
661f17f7ee2SSatish Balay PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_FALSE));
6623ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
663f17f7ee2SSatish Balay }
664f17f7ee2SSatish Balay
MatMultTransposeAdd_H2OPUS(Mat A,Vec x,Vec y,Vec z)665d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
666d71ae5a4SJacob Faibussowitsch {
667f17f7ee2SSatish Balay PetscBool xiscuda, ziscuda;
668f17f7ee2SSatish Balay
669f17f7ee2SSatish Balay PetscFunctionBegin;
670f17f7ee2SSatish Balay PetscCall(VecCopy(y, z));
671f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
672f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
673f17f7ee2SSatish Balay PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
674f17f7ee2SSatish Balay PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_TRUE));
6753ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
676f17f7ee2SSatish Balay }
677f17f7ee2SSatish Balay
MatMultAdd_H2OPUS(Mat A,Vec x,Vec y,Vec z)678d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
679d71ae5a4SJacob Faibussowitsch {
680f17f7ee2SSatish Balay PetscBool xiscuda, ziscuda;
681f17f7ee2SSatish Balay
682f17f7ee2SSatish Balay PetscFunctionBegin;
683f17f7ee2SSatish Balay PetscCall(VecCopy(y, z));
684f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
685f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
686f17f7ee2SSatish Balay PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
687f17f7ee2SSatish Balay PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_FALSE));
6883ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
689f17f7ee2SSatish Balay }
690f17f7ee2SSatish Balay
MatScale_H2OPUS(Mat A,PetscScalar s)691d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
692d71ae5a4SJacob Faibussowitsch {
693f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
694f17f7ee2SSatish Balay
695f17f7ee2SSatish Balay PetscFunctionBegin;
696f17f7ee2SSatish Balay a->s *= s;
6973ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
698f17f7ee2SSatish Balay }
699f17f7ee2SSatish Balay
MatSetFromOptions_H2OPUS(Mat A,PetscOptionItems PetscOptionsObject)700ce78bad3SBarry Smith static PetscErrorCode MatSetFromOptions_H2OPUS(Mat A, PetscOptionItems PetscOptionsObject)
701d71ae5a4SJacob Faibussowitsch {
702f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
703f17f7ee2SSatish Balay
704f17f7ee2SSatish Balay PetscFunctionBegin;
705036e4bdcSSatish Balay PetscOptionsHeadBegin(PetscOptionsObject, "H2OPUS options");
706f17f7ee2SSatish Balay PetscCall(PetscOptionsInt("-mat_h2opus_leafsize", "Leaf size of cluster tree", NULL, a->leafsize, &a->leafsize, NULL));
707f17f7ee2SSatish Balay PetscCall(PetscOptionsReal("-mat_h2opus_eta", "Admissibility condition tolerance", NULL, a->eta, &a->eta, NULL));
708f17f7ee2SSatish Balay PetscCall(PetscOptionsInt("-mat_h2opus_order", "Basis order for off-diagonal sampling when constructed from kernel", NULL, a->basisord, &a->basisord, NULL));
709f17f7ee2SSatish Balay PetscCall(PetscOptionsInt("-mat_h2opus_maxrank", "Maximum rank when constructed from matvecs", NULL, a->max_rank, &a->max_rank, NULL));
710f17f7ee2SSatish Balay PetscCall(PetscOptionsInt("-mat_h2opus_samples", "Maximum number of samples to be taken concurrently when constructing from matvecs", NULL, a->bs, &a->bs, NULL));
711aaa8cc7dSPierre Jolivet PetscCall(PetscOptionsInt("-mat_h2opus_normsamples", "Maximum number of samples to be when estimating norms", NULL, a->norm_max_samples, &a->norm_max_samples, NULL));
712f17f7ee2SSatish Balay PetscCall(PetscOptionsReal("-mat_h2opus_rtol", "Relative tolerance for construction from sampling", NULL, a->rtol, &a->rtol, NULL));
713f17f7ee2SSatish Balay PetscCall(PetscOptionsBool("-mat_h2opus_check", "Check error when constructing from sampling during MatAssemblyEnd()", NULL, a->check_construction, &a->check_construction, NULL));
714f17f7ee2SSatish Balay PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose", "Verbose output from hara construction", NULL, a->hara_verbose, &a->hara_verbose, NULL));
715036e4bdcSSatish Balay PetscCall(PetscOptionsBool("-mat_h2opus_resize", "Resize after compression", NULL, a->resize, &a->resize, NULL));
716036e4bdcSSatish Balay PetscOptionsHeadEnd();
7173ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
718f17f7ee2SSatish Balay }
719f17f7ee2SSatish Balay
7208434afd1SBarry Smith static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat, PetscInt, const PetscReal[], PetscBool, MatH2OpusKernelFn *, void *);
721f17f7ee2SSatish Balay
MatH2OpusInferCoordinates_Private(Mat A)722d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
723d71ae5a4SJacob Faibussowitsch {
724f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
725f17f7ee2SSatish Balay Vec c;
726f17f7ee2SSatish Balay PetscInt spacedim;
727f17f7ee2SSatish Balay const PetscScalar *coords;
728f17f7ee2SSatish Balay
729f17f7ee2SSatish Balay PetscFunctionBegin;
7303ba16761SJacob Faibussowitsch if (a->ptcloud) PetscFunctionReturn(PETSC_SUCCESS);
731f17f7ee2SSatish Balay PetscCall(PetscObjectQuery((PetscObject)A, "__math2opus_coords", (PetscObject *)&c));
732f17f7ee2SSatish Balay if (!c && a->sampler) {
733f17f7ee2SSatish Balay Mat S = a->sampler->GetSamplingMat();
734f17f7ee2SSatish Balay
735f17f7ee2SSatish Balay PetscCall(PetscObjectQuery((PetscObject)S, "__math2opus_coords", (PetscObject *)&c));
736f17f7ee2SSatish Balay }
737f17f7ee2SSatish Balay if (!c) {
738f17f7ee2SSatish Balay PetscCall(MatH2OpusSetCoords_H2OPUS(A, -1, NULL, PETSC_FALSE, NULL, NULL));
739f17f7ee2SSatish Balay } else {
740f17f7ee2SSatish Balay PetscCall(VecGetArrayRead(c, &coords));
741f17f7ee2SSatish Balay PetscCall(VecGetBlockSize(c, &spacedim));
742f17f7ee2SSatish Balay PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, PETSC_FALSE, NULL, NULL));
743f17f7ee2SSatish Balay PetscCall(VecRestoreArrayRead(c, &coords));
744f17f7ee2SSatish Balay }
7453ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
746f17f7ee2SSatish Balay }
747f17f7ee2SSatish Balay
MatSetUpMultiply_H2OPUS(Mat A)748d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
749d71ae5a4SJacob Faibussowitsch {
750f17f7ee2SSatish Balay MPI_Comm comm;
751f17f7ee2SSatish Balay PetscMPIInt size;
752f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
753f17f7ee2SSatish Balay PetscInt n = 0, *idx = NULL;
754f17f7ee2SSatish Balay int *iidx = NULL;
755f17f7ee2SSatish Balay PetscCopyMode own;
756f17f7ee2SSatish Balay PetscBool rid;
757f17f7ee2SSatish Balay
758f17f7ee2SSatish Balay PetscFunctionBegin;
7593ba16761SJacob Faibussowitsch if (a->multsetup) PetscFunctionReturn(PETSC_SUCCESS);
760f17f7ee2SSatish Balay if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */
761f17f7ee2SSatish Balay PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
762f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
763f17f7ee2SSatish Balay a->xx_gpu = new thrust::device_vector<PetscScalar>(n);
764f17f7ee2SSatish Balay a->yy_gpu = new thrust::device_vector<PetscScalar>(n);
765f17f7ee2SSatish Balay a->xxs_gpu = 1;
766f17f7ee2SSatish Balay a->yys_gpu = 1;
767f17f7ee2SSatish Balay #endif
768f17f7ee2SSatish Balay a->xx = new thrust::host_vector<PetscScalar>(n);
769f17f7ee2SSatish Balay a->yy = new thrust::host_vector<PetscScalar>(n);
770f17f7ee2SSatish Balay a->xxs = 1;
771f17f7ee2SSatish Balay a->yys = 1;
772f17f7ee2SSatish Balay } else {
773f17f7ee2SSatish Balay IS is;
774f17f7ee2SSatish Balay PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
775f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(comm, &size));
776f17f7ee2SSatish Balay if (!a->h2opus_indexmap) {
777f17f7ee2SSatish Balay if (size > 1) {
778f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
779f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
780f17f7ee2SSatish Balay iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
781f17f7ee2SSatish Balay n = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
782f17f7ee2SSatish Balay #endif
783f17f7ee2SSatish Balay } else {
784f17f7ee2SSatish Balay iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
785f17f7ee2SSatish Balay n = a->hmatrix->u_basis_tree.index_map.size();
786f17f7ee2SSatish Balay }
787f17f7ee2SSatish Balay
788f17f7ee2SSatish Balay if (PetscDefined(USE_64BIT_INDICES)) {
789f17f7ee2SSatish Balay PetscInt i;
790f17f7ee2SSatish Balay
791f17f7ee2SSatish Balay own = PETSC_OWN_POINTER;
792f17f7ee2SSatish Balay PetscCall(PetscMalloc1(n, &idx));
793f17f7ee2SSatish Balay for (i = 0; i < n; i++) idx[i] = iidx[i];
794f17f7ee2SSatish Balay } else {
795f17f7ee2SSatish Balay own = PETSC_COPY_VALUES;
796f17f7ee2SSatish Balay idx = (PetscInt *)iidx;
797f17f7ee2SSatish Balay }
798f17f7ee2SSatish Balay PetscCall(ISCreateGeneral(comm, n, idx, own, &is));
799f17f7ee2SSatish Balay PetscCall(ISSetPermutation(is));
800f17f7ee2SSatish Balay PetscCall(ISViewFromOptions(is, (PetscObject)A, "-mat_h2opus_indexmap_view"));
801f17f7ee2SSatish Balay a->h2opus_indexmap = is;
802f17f7ee2SSatish Balay }
803f17f7ee2SSatish Balay PetscCall(ISGetLocalSize(a->h2opus_indexmap, &n));
804f17f7ee2SSatish Balay PetscCall(ISGetIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
805f17f7ee2SSatish Balay rid = (PetscBool)(n == A->rmap->n);
8065440e5dcSBarry Smith PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &rid, 1, MPI_C_BOOL, MPI_LAND, comm));
80748a46eb9SPierre Jolivet if (rid) PetscCall(ISIdentity(a->h2opus_indexmap, &rid));
808f17f7ee2SSatish Balay if (!rid) {
809f17f7ee2SSatish Balay if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
810f17f7ee2SSatish Balay PetscCall(PetscLayoutCreate(comm, &a->h2opus_rmap));
811f17f7ee2SSatish Balay PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap, n));
812f17f7ee2SSatish Balay PetscCall(PetscLayoutSetUp(a->h2opus_rmap));
813f17f7ee2SSatish Balay PetscCall(PetscLayoutReference(a->h2opus_rmap, &a->h2opus_cmap));
814f17f7ee2SSatish Balay }
815f17f7ee2SSatish Balay PetscCall(PetscSFCreate(comm, &a->sf));
816f17f7ee2SSatish Balay PetscCall(PetscSFSetGraphLayout(a->sf, A->rmap, n, NULL, PETSC_OWN_POINTER, idx));
817036e4bdcSSatish Balay PetscCall(PetscSFSetUp(a->sf));
818f17f7ee2SSatish Balay PetscCall(PetscSFViewFromOptions(a->sf, (PetscObject)A, "-mat_h2opus_sf_view"));
819f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
820f17f7ee2SSatish Balay a->xx_gpu = new thrust::device_vector<PetscScalar>(n);
821f17f7ee2SSatish Balay a->yy_gpu = new thrust::device_vector<PetscScalar>(n);
822f17f7ee2SSatish Balay a->xxs_gpu = 1;
823f17f7ee2SSatish Balay a->yys_gpu = 1;
824f17f7ee2SSatish Balay #endif
825f17f7ee2SSatish Balay a->xx = new thrust::host_vector<PetscScalar>(n);
826f17f7ee2SSatish Balay a->yy = new thrust::host_vector<PetscScalar>(n);
827f17f7ee2SSatish Balay a->xxs = 1;
828f17f7ee2SSatish Balay a->yys = 1;
829f17f7ee2SSatish Balay }
830f17f7ee2SSatish Balay PetscCall(ISRestoreIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
831f17f7ee2SSatish Balay }
832f17f7ee2SSatish Balay a->multsetup = PETSC_TRUE;
8333ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
834f17f7ee2SSatish Balay }
835f17f7ee2SSatish Balay
MatAssemblyEnd_H2OPUS(Mat A,MatAssemblyType assemblytype)836d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
837d71ae5a4SJacob Faibussowitsch {
838f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
839f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
840f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle->handle;
841f17f7ee2SSatish Balay #else
842f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle;
843f17f7ee2SSatish Balay #endif
844f17f7ee2SSatish Balay PetscBool kernel = PETSC_FALSE;
845f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE;
846f17f7ee2SSatish Balay PetscBool samplingdone = PETSC_FALSE;
847f17f7ee2SSatish Balay MPI_Comm comm;
848f17f7ee2SSatish Balay PetscMPIInt size;
849f17f7ee2SSatish Balay
850f17f7ee2SSatish Balay PetscFunctionBegin;
851f17f7ee2SSatish Balay PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
852036e4bdcSSatish Balay PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
853036e4bdcSSatish Balay PetscCheck(A->rmap->N == A->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
854f17f7ee2SSatish Balay
855f17f7ee2SSatish Balay /* XXX */
856f17f7ee2SSatish Balay a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));
857f17f7ee2SSatish Balay
858f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(comm, &size));
859f17f7ee2SSatish Balay /* TODO REUSABILITY of geometric construction */
860f17f7ee2SSatish Balay delete a->hmatrix;
861f17f7ee2SSatish Balay delete a->dist_hmatrix;
862f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
863f17f7ee2SSatish Balay delete a->hmatrix_gpu;
864f17f7ee2SSatish Balay delete a->dist_hmatrix_gpu;
865f17f7ee2SSatish Balay #endif
866f17f7ee2SSatish Balay a->orthogonal = PETSC_FALSE;
867f17f7ee2SSatish Balay
868f17f7ee2SSatish Balay /* TODO: other? */
869f17f7ee2SSatish Balay H2OpusBoxCenterAdmissibility adm(a->eta);
870f17f7ee2SSatish Balay
871f17f7ee2SSatish Balay PetscCall(PetscLogEventBegin(MAT_H2Opus_Build, A, 0, 0, 0));
872f17f7ee2SSatish Balay if (size > 1) {
873f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
874f17f7ee2SSatish Balay a->dist_hmatrix = new DistributedHMatrix(A->rmap->n /* ,A->symmetric */);
875f17f7ee2SSatish Balay #else
876f17f7ee2SSatish Balay a->dist_hmatrix = NULL;
877f17f7ee2SSatish Balay #endif
878b94d7dedSBarry Smith } else a->hmatrix = new HMatrix(A->rmap->n, A->symmetric == PETSC_BOOL3_TRUE);
879f17f7ee2SSatish Balay PetscCall(MatH2OpusInferCoordinates_Private(A));
880f17f7ee2SSatish Balay PetscCheck(a->ptcloud, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing pointcloud");
881f17f7ee2SSatish Balay if (a->kernel) {
882f17f7ee2SSatish Balay BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
883f17f7ee2SSatish Balay if (size > 1) {
884f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
885f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
886f17f7ee2SSatish Balay buildDistributedHMatrix(*a->dist_hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord, a->handle);
887f17f7ee2SSatish Balay #endif
888f17f7ee2SSatish Balay } else {
889f17f7ee2SSatish Balay buildHMatrix(*a->hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord);
890f17f7ee2SSatish Balay }
891f17f7ee2SSatish Balay kernel = PETSC_TRUE;
892f17f7ee2SSatish Balay } else {
893036e4bdcSSatish Balay PetscCheck(size <= 1, comm, PETSC_ERR_SUP, "Construction from sampling not supported in parallel");
894f17f7ee2SSatish Balay buildHMatrixStructure(*a->hmatrix, a->ptcloud, a->leafsize, adm);
895f17f7ee2SSatish Balay }
896f17f7ee2SSatish Balay PetscCall(MatSetUpMultiply_H2OPUS(A));
897f17f7ee2SSatish Balay
898f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
899f17f7ee2SSatish Balay boundtocpu = A->boundtocpu;
900f17f7ee2SSatish Balay if (!boundtocpu) {
901f17f7ee2SSatish Balay if (size > 1) {
902f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
903f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
904f17f7ee2SSatish Balay a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
905f17f7ee2SSatish Balay #endif
906f17f7ee2SSatish Balay } else {
907f17f7ee2SSatish Balay a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
908f17f7ee2SSatish Balay }
909f17f7ee2SSatish Balay }
910f17f7ee2SSatish Balay #endif
911f17f7ee2SSatish Balay if (size == 1) {
912f17f7ee2SSatish Balay if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
913f17f7ee2SSatish Balay PetscReal Anorm;
914f17f7ee2SSatish Balay bool verbose;
915f17f7ee2SSatish Balay
916f17f7ee2SSatish Balay PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_hara_verbose", &a->hara_verbose, NULL));
917f17f7ee2SSatish Balay verbose = a->hara_verbose;
918f17f7ee2SSatish Balay PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(), NORM_2, a->norm_max_samples, &Anorm));
919f17f7ee2SSatish Balay if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF, "Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n", a->max_rank, a->rtol * Anorm, a->rtol, Anorm, boundtocpu ? "CPU" : "GPU", a->bs));
920ad540459SPierre Jolivet if (a->sf && !a->nativemult) a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(), a->hmatrix->u_basis_tree.index_map.data());
921f17f7ee2SSatish Balay a->sampler->SetStream(handle->getMainStream());
922f17f7ee2SSatish Balay if (boundtocpu) {
923f17f7ee2SSatish Balay a->sampler->SetGPUSampling(false);
924f17f7ee2SSatish Balay hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
925f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
926f17f7ee2SSatish Balay } else {
927f17f7ee2SSatish Balay a->sampler->SetGPUSampling(true);
928f17f7ee2SSatish Balay hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
929f17f7ee2SSatish Balay #endif
930f17f7ee2SSatish Balay }
931f17f7ee2SSatish Balay samplingdone = PETSC_TRUE;
932f17f7ee2SSatish Balay }
933f17f7ee2SSatish Balay }
934f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
935f17f7ee2SSatish Balay if (!boundtocpu) {
936f17f7ee2SSatish Balay delete a->hmatrix;
937f17f7ee2SSatish Balay delete a->dist_hmatrix;
938f17f7ee2SSatish Balay a->hmatrix = NULL;
939f17f7ee2SSatish Balay a->dist_hmatrix = NULL;
940f17f7ee2SSatish Balay }
941f17f7ee2SSatish Balay A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
942f17f7ee2SSatish Balay #endif
943f17f7ee2SSatish Balay PetscCall(PetscLogEventEnd(MAT_H2Opus_Build, A, 0, 0, 0));
944f17f7ee2SSatish Balay
945f17f7ee2SSatish Balay if (!a->s) a->s = 1.0;
946f17f7ee2SSatish Balay A->assembled = PETSC_TRUE;
947f17f7ee2SSatish Balay
948f17f7ee2SSatish Balay if (samplingdone) {
949f17f7ee2SSatish Balay PetscBool check = a->check_construction;
950f17f7ee2SSatish Balay PetscBool checke = PETSC_FALSE;
951f17f7ee2SSatish Balay
952f17f7ee2SSatish Balay PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check", &check, NULL));
953f17f7ee2SSatish Balay PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check_explicit", &checke, NULL));
954f17f7ee2SSatish Balay if (check) {
955f17f7ee2SSatish Balay Mat E, Ae;
956f17f7ee2SSatish Balay PetscReal n1, ni, n2;
957f17f7ee2SSatish Balay PetscReal n1A, niA, n2A;
95857d50842SBarry Smith PetscErrorCodeFn *normfunc;
959f17f7ee2SSatish Balay
960f17f7ee2SSatish Balay Ae = a->sampler->GetSamplingMat();
961f17f7ee2SSatish Balay PetscCall(MatConvert(A, MATSHELL, MAT_INITIAL_MATRIX, &E));
96257d50842SBarry Smith PetscCall(MatShellSetOperation(E, MATOP_NORM, (PetscErrorCodeFn *)MatNorm_H2OPUS));
963f17f7ee2SSatish Balay PetscCall(MatAXPY(E, -1.0, Ae, DIFFERENT_NONZERO_PATTERN));
964f17f7ee2SSatish Balay PetscCall(MatNorm(E, NORM_1, &n1));
965f17f7ee2SSatish Balay PetscCall(MatNorm(E, NORM_INFINITY, &ni));
966f17f7ee2SSatish Balay PetscCall(MatNorm(E, NORM_2, &n2));
967f17f7ee2SSatish Balay if (checke) {
968f17f7ee2SSatish Balay Mat eA, eE, eAe;
969f17f7ee2SSatish Balay
970f17f7ee2SSatish Balay PetscCall(MatComputeOperator(A, MATAIJ, &eA));
971f17f7ee2SSatish Balay PetscCall(MatComputeOperator(E, MATAIJ, &eE));
972f17f7ee2SSatish Balay PetscCall(MatComputeOperator(Ae, MATAIJ, &eAe));
9732ce66baaSPierre Jolivet PetscCall(MatFilter(eA, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
9742ce66baaSPierre Jolivet PetscCall(MatFilter(eE, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
9752ce66baaSPierre Jolivet PetscCall(MatFilter(eAe, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
976f17f7ee2SSatish Balay PetscCall(PetscObjectSetName((PetscObject)eA, "H2Mat"));
977f17f7ee2SSatish Balay PetscCall(MatView(eA, NULL));
978f17f7ee2SSatish Balay PetscCall(PetscObjectSetName((PetscObject)eAe, "S"));
979f17f7ee2SSatish Balay PetscCall(MatView(eAe, NULL));
980f17f7ee2SSatish Balay PetscCall(PetscObjectSetName((PetscObject)eE, "H2Mat - S"));
981f17f7ee2SSatish Balay PetscCall(MatView(eE, NULL));
982f17f7ee2SSatish Balay PetscCall(MatDestroy(&eA));
983f17f7ee2SSatish Balay PetscCall(MatDestroy(&eE));
984f17f7ee2SSatish Balay PetscCall(MatDestroy(&eAe));
985f17f7ee2SSatish Balay }
986f17f7ee2SSatish Balay
987f17f7ee2SSatish Balay PetscCall(MatGetOperation(Ae, MATOP_NORM, &normfunc));
98857d50842SBarry Smith PetscCall(MatSetOperation(Ae, MATOP_NORM, (PetscErrorCodeFn *)MatNorm_H2OPUS));
989f17f7ee2SSatish Balay PetscCall(MatNorm(Ae, NORM_1, &n1A));
990f17f7ee2SSatish Balay PetscCall(MatNorm(Ae, NORM_INFINITY, &niA));
991f17f7ee2SSatish Balay PetscCall(MatNorm(Ae, NORM_2, &n2A));
992f17f7ee2SSatish Balay n1A = PetscMax(n1A, PETSC_SMALL);
993f17f7ee2SSatish Balay n2A = PetscMax(n2A, PETSC_SMALL);
994f17f7ee2SSatish Balay niA = PetscMax(niA, PETSC_SMALL);
995f17f7ee2SSatish Balay PetscCall(MatSetOperation(Ae, MATOP_NORM, normfunc));
996f17f7ee2SSatish Balay PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n", (double)n1, (double)ni, (double)n2, (double)(n1 / n1A), (double)(ni / niA), (double)(n2 / n2A)));
997f17f7ee2SSatish Balay PetscCall(MatDestroy(&E));
998f17f7ee2SSatish Balay }
999f17f7ee2SSatish Balay a->sampler->SetSamplingMat(NULL);
1000f17f7ee2SSatish Balay }
10013ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1002f17f7ee2SSatish Balay }
1003f17f7ee2SSatish Balay
MatZeroEntries_H2OPUS(Mat A)1004d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
1005d71ae5a4SJacob Faibussowitsch {
1006f17f7ee2SSatish Balay PetscMPIInt size;
1007f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1008f17f7ee2SSatish Balay
1009f17f7ee2SSatish Balay PetscFunctionBegin;
1010f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1011036e4bdcSSatish Balay PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet supported");
1012f17f7ee2SSatish Balay a->hmatrix->clearData();
1013f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1014f17f7ee2SSatish Balay if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1015f17f7ee2SSatish Balay #endif
10163ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1017f17f7ee2SSatish Balay }
1018f17f7ee2SSatish Balay
MatDuplicate_H2OPUS(Mat B,MatDuplicateOption op,Mat * nA)1019d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
1020d71ae5a4SJacob Faibussowitsch {
1021f17f7ee2SSatish Balay Mat A;
1022f17f7ee2SSatish Balay Mat_H2OPUS *a, *b = (Mat_H2OPUS *)B->data;
1023*fc2fb351SPierre Jolivet PetscBool iscpu = PetscDefined(H2OPUS_USE_GPU) ? PETSC_FALSE : PETSC_TRUE;
1024f17f7ee2SSatish Balay MPI_Comm comm;
1025f17f7ee2SSatish Balay
1026f17f7ee2SSatish Balay PetscFunctionBegin;
1027f17f7ee2SSatish Balay PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
1028f17f7ee2SSatish Balay PetscCall(MatCreate(comm, &A));
1029f17f7ee2SSatish Balay PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
1030f17f7ee2SSatish Balay PetscCall(MatSetType(A, MATH2OPUS));
1031f17f7ee2SSatish Balay PetscCall(MatPropagateSymmetryOptions(B, A));
1032f17f7ee2SSatish Balay a = (Mat_H2OPUS *)A->data;
1033f17f7ee2SSatish Balay
1034f17f7ee2SSatish Balay a->eta = b->eta;
1035f17f7ee2SSatish Balay a->leafsize = b->leafsize;
1036f17f7ee2SSatish Balay a->basisord = b->basisord;
1037f17f7ee2SSatish Balay a->max_rank = b->max_rank;
1038f17f7ee2SSatish Balay a->bs = b->bs;
1039f17f7ee2SSatish Balay a->rtol = b->rtol;
1040f17f7ee2SSatish Balay a->norm_max_samples = b->norm_max_samples;
1041f17f7ee2SSatish Balay if (op == MAT_COPY_VALUES) a->s = b->s;
1042f17f7ee2SSatish Balay
1043f17f7ee2SSatish Balay a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
1044f17f7ee2SSatish Balay if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);
1045f17f7ee2SSatish Balay
1046f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1047ad540459SPierre Jolivet if (b->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix);
1048f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1049ad540459SPierre Jolivet if (b->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu);
1050f17f7ee2SSatish Balay #endif
1051f17f7ee2SSatish Balay #endif
1052f17f7ee2SSatish Balay if (b->hmatrix) {
1053f17f7ee2SSatish Balay a->hmatrix = new HMatrix(*b->hmatrix);
1054f17f7ee2SSatish Balay if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
1055f17f7ee2SSatish Balay }
1056f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1057f17f7ee2SSatish Balay if (b->hmatrix_gpu) {
1058f17f7ee2SSatish Balay a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
1059f17f7ee2SSatish Balay if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
1060f17f7ee2SSatish Balay }
1061f17f7ee2SSatish Balay #endif
1062f17f7ee2SSatish Balay if (b->sf) {
1063f17f7ee2SSatish Balay PetscCall(PetscObjectReference((PetscObject)b->sf));
1064f17f7ee2SSatish Balay a->sf = b->sf;
1065f17f7ee2SSatish Balay }
1066f17f7ee2SSatish Balay if (b->h2opus_indexmap) {
1067f17f7ee2SSatish Balay PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap));
1068f17f7ee2SSatish Balay a->h2opus_indexmap = b->h2opus_indexmap;
1069f17f7ee2SSatish Balay }
1070f17f7ee2SSatish Balay
1071f17f7ee2SSatish Balay PetscCall(MatSetUp(A));
1072f17f7ee2SSatish Balay PetscCall(MatSetUpMultiply_H2OPUS(A));
1073f17f7ee2SSatish Balay if (op == MAT_COPY_VALUES) {
1074f17f7ee2SSatish Balay A->assembled = PETSC_TRUE;
1075f17f7ee2SSatish Balay a->orthogonal = b->orthogonal;
1076f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1077f17f7ee2SSatish Balay A->offloadmask = B->offloadmask;
1078f17f7ee2SSatish Balay #endif
1079f17f7ee2SSatish Balay }
1080f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1081f17f7ee2SSatish Balay iscpu = B->boundtocpu;
1082f17f7ee2SSatish Balay #endif
1083f17f7ee2SSatish Balay PetscCall(MatBindToCPU(A, iscpu));
1084f17f7ee2SSatish Balay
1085f17f7ee2SSatish Balay *nA = A;
10863ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1087f17f7ee2SSatish Balay }
1088f17f7ee2SSatish Balay
MatView_H2OPUS(Mat A,PetscViewer view)1089d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
1090d71ae5a4SJacob Faibussowitsch {
1091f17f7ee2SSatish Balay Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
1092036e4bdcSSatish Balay PetscBool isascii, vieweps;
1093f17f7ee2SSatish Balay PetscMPIInt size;
1094f17f7ee2SSatish Balay PetscViewerFormat format;
1095f17f7ee2SSatish Balay
1096f17f7ee2SSatish Balay PetscFunctionBegin;
1097f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)view, PETSCVIEWERASCII, &isascii));
1098f17f7ee2SSatish Balay PetscCall(PetscViewerGetFormat(view, &format));
1099f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1100f17f7ee2SSatish Balay if (isascii) {
1101f17f7ee2SSatish Balay if (format == PETSC_VIEWER_ASCII_MATLAB) {
1102f17f7ee2SSatish Balay if (size == 1) {
1103f17f7ee2SSatish Balay FILE *fp;
1104f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIGetPointer(view, &fp));
1105f17f7ee2SSatish Balay dumpHMatrix(*h2opus->hmatrix, 6, fp);
1106f17f7ee2SSatish Balay }
1107f17f7ee2SSatish Balay } else {
1108f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view, " H-Matrix constructed from %s\n", h2opus->kernel ? "Kernel" : "Mat"));
1109f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view, " PointCloud dim %" PetscInt_FMT "\n", h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0));
1110f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view, " Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n", h2opus->leafsize, (double)h2opus->eta));
1111f17f7ee2SSatish Balay if (!h2opus->kernel) {
1112f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view, " Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n", h2opus->max_rank, h2opus->bs, (double)h2opus->rtol));
1113f17f7ee2SSatish Balay } else {
11144cf0e950SBarry Smith PetscCall(PetscViewerASCIIPrintf(view, " Off-diagonal blocks approximation order %" PetscInt_FMT "\n", h2opus->basisord));
1115f17f7ee2SSatish Balay }
1116f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view, " Number of samples for norms %" PetscInt_FMT "\n", h2opus->norm_max_samples));
1117f17f7ee2SSatish Balay if (size == 1) {
1118f17f7ee2SSatish Balay double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
1119f17f7ee2SSatish Balay double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
1120f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA)
1121f17f7ee2SSatish Balay double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
1122f17f7ee2SSatish Balay double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1123f17f7ee2SSatish Balay #endif
1124f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu));
1125f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA)
1126f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu));
1127f17f7ee2SSatish Balay #endif
1128f17f7ee2SSatish Balay } else {
1129f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA)
1130f17f7ee2SSatish Balay double matrix_mem[4] = {0., 0., 0., 0.};
1131f17f7ee2SSatish Balay PetscMPIInt rsize = 4;
1132f17f7ee2SSatish Balay #else
1133f17f7ee2SSatish Balay double matrix_mem[2] = {0., 0.};
1134f17f7ee2SSatish Balay PetscMPIInt rsize = 2;
1135f17f7ee2SSatish Balay #endif
1136f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1137f17f7ee2SSatish Balay matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
1138f17f7ee2SSatish Balay matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
1139f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA)
1140f17f7ee2SSatish Balay matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
1141f17f7ee2SSatish Balay matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
1142f17f7ee2SSatish Balay #endif
1143f17f7ee2SSatish Balay #endif
1144462c564dSBarry Smith PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, matrix_mem, rsize, MPI_DOUBLE_PRECISION, MPI_SUM, PetscObjectComm((PetscObject)A)));
1145f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]));
1146f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA)
1147f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]));
1148f17f7ee2SSatish Balay #endif
1149f17f7ee2SSatish Balay }
1150f17f7ee2SSatish Balay }
1151f17f7ee2SSatish Balay }
1152036e4bdcSSatish Balay vieweps = PETSC_FALSE;
1153036e4bdcSSatish Balay PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps", &vieweps, NULL));
1154036e4bdcSSatish Balay if (vieweps) {
1155f17f7ee2SSatish Balay char filename[256];
1156f17f7ee2SSatish Balay const char *name;
1157f17f7ee2SSatish Balay
1158f17f7ee2SSatish Balay PetscCall(PetscObjectGetName((PetscObject)A, &name));
1159f17f7ee2SSatish Balay PetscCall(PetscSNPrintf(filename, sizeof(filename), "%s_structure.eps", name));
1160036e4bdcSSatish Balay PetscCall(PetscOptionsGetString(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps_filename", filename, sizeof(filename), NULL));
1161f17f7ee2SSatish Balay outputEps(*h2opus->hmatrix, filename);
1162f17f7ee2SSatish Balay }
11633ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1164f17f7ee2SSatish Balay }
1165f17f7ee2SSatish Balay
MatH2OpusSetCoords_H2OPUS(Mat A,PetscInt spacedim,const PetscReal coords[],PetscBool cdist,MatH2OpusKernelFn * kernel,void * kernelctx)11668434afd1SBarry Smith static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernelFn *kernel, void *kernelctx)
1167d71ae5a4SJacob Faibussowitsch {
1168f17f7ee2SSatish Balay Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
1169f17f7ee2SSatish Balay PetscReal *gcoords;
1170f17f7ee2SSatish Balay PetscInt N;
1171f17f7ee2SSatish Balay MPI_Comm comm;
1172f17f7ee2SSatish Balay PetscMPIInt size;
1173f17f7ee2SSatish Balay PetscBool cong;
1174f17f7ee2SSatish Balay
1175f17f7ee2SSatish Balay PetscFunctionBegin;
1176f17f7ee2SSatish Balay PetscCall(PetscLayoutSetUp(A->rmap));
1177f17f7ee2SSatish Balay PetscCall(PetscLayoutSetUp(A->cmap));
1178f17f7ee2SSatish Balay PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1179f17f7ee2SSatish Balay PetscCall(MatHasCongruentLayouts(A, &cong));
1180f17f7ee2SSatish Balay PetscCheck(cong, comm, PETSC_ERR_SUP, "Only for square matrices with congruent layouts");
1181f17f7ee2SSatish Balay N = A->rmap->N;
1182f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(comm, &size));
1183f17f7ee2SSatish Balay if (spacedim > 0 && size > 1 && cdist) {
1184f17f7ee2SSatish Balay PetscSF sf;
1185f17f7ee2SSatish Balay MPI_Datatype dtype;
1186f17f7ee2SSatish Balay
1187f17f7ee2SSatish Balay PetscCallMPI(MPI_Type_contiguous(spacedim, MPIU_REAL, &dtype));
1188f17f7ee2SSatish Balay PetscCallMPI(MPI_Type_commit(&dtype));
1189f17f7ee2SSatish Balay
1190f17f7ee2SSatish Balay PetscCall(PetscSFCreate(comm, &sf));
1191f17f7ee2SSatish Balay PetscCall(PetscSFSetGraphWithPattern(sf, A->rmap, PETSCSF_PATTERN_ALLGATHER));
1192f17f7ee2SSatish Balay PetscCall(PetscMalloc1(spacedim * N, &gcoords));
1193f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(sf, dtype, coords, gcoords, MPI_REPLACE));
1194f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(sf, dtype, coords, gcoords, MPI_REPLACE));
1195f17f7ee2SSatish Balay PetscCall(PetscSFDestroy(&sf));
1196f17f7ee2SSatish Balay PetscCallMPI(MPI_Type_free(&dtype));
1197f17f7ee2SSatish Balay } else gcoords = (PetscReal *)coords;
1198f17f7ee2SSatish Balay
1199f17f7ee2SSatish Balay delete h2opus->ptcloud;
1200f17f7ee2SSatish Balay delete h2opus->kernel;
1201f17f7ee2SSatish Balay h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim, N, gcoords);
1202f17f7ee2SSatish Balay if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel, spacedim, kernelctx);
1203f17f7ee2SSatish Balay if (gcoords != coords) PetscCall(PetscFree(gcoords));
1204f17f7ee2SSatish Balay A->preallocated = PETSC_TRUE;
12053ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1206f17f7ee2SSatish Balay }
1207f17f7ee2SSatish Balay
1208f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
MatBindToCPU_H2OPUS(Mat A,PetscBool flg)1209d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
1210d71ae5a4SJacob Faibussowitsch {
1211f17f7ee2SSatish Balay PetscMPIInt size;
1212f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1213f17f7ee2SSatish Balay
1214f17f7ee2SSatish Balay PetscFunctionBegin;
1215f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1216f17f7ee2SSatish Balay if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
1217f17f7ee2SSatish Balay if (size > 1) {
1218f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1219f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1220f17f7ee2SSatish Balay if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
1221f17f7ee2SSatish Balay else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
1222f17f7ee2SSatish Balay #endif
1223f17f7ee2SSatish Balay } else {
1224f17f7ee2SSatish Balay PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1225f17f7ee2SSatish Balay if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
1226f17f7ee2SSatish Balay else *a->hmatrix = *a->hmatrix_gpu;
1227f17f7ee2SSatish Balay }
1228f17f7ee2SSatish Balay delete a->hmatrix_gpu;
1229f17f7ee2SSatish Balay delete a->dist_hmatrix_gpu;
1230f17f7ee2SSatish Balay a->hmatrix_gpu = NULL;
1231f17f7ee2SSatish Balay a->dist_hmatrix_gpu = NULL;
1232f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_CPU;
1233f17f7ee2SSatish Balay } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
1234f17f7ee2SSatish Balay if (size > 1) {
1235f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1236f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1237f17f7ee2SSatish Balay if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
1238f17f7ee2SSatish Balay else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
1239f17f7ee2SSatish Balay #endif
1240f17f7ee2SSatish Balay } else {
1241f17f7ee2SSatish Balay PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1242f17f7ee2SSatish Balay if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
1243f17f7ee2SSatish Balay else *a->hmatrix_gpu = *a->hmatrix;
1244f17f7ee2SSatish Balay }
1245f17f7ee2SSatish Balay delete a->hmatrix;
1246f17f7ee2SSatish Balay delete a->dist_hmatrix;
1247f17f7ee2SSatish Balay a->hmatrix = NULL;
1248f17f7ee2SSatish Balay a->dist_hmatrix = NULL;
1249f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_GPU;
1250f17f7ee2SSatish Balay }
1251f17f7ee2SSatish Balay PetscCall(PetscFree(A->defaultvectype));
1252f17f7ee2SSatish Balay if (!flg) {
1253f17f7ee2SSatish Balay PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
1254f17f7ee2SSatish Balay } else {
1255f17f7ee2SSatish Balay PetscCall(PetscStrallocpy(VECSTANDARD, &A->defaultvectype));
1256f17f7ee2SSatish Balay }
1257f17f7ee2SSatish Balay A->boundtocpu = flg;
12583ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1259f17f7ee2SSatish Balay }
1260f17f7ee2SSatish Balay #endif
1261f17f7ee2SSatish Balay
1262f17f7ee2SSatish Balay /*MC
12631d27aa22SBarry Smith MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package {cite}`zampinibouakaramturkiyyahkniokeyes2022`.
1264f17f7ee2SSatish Balay
12652ef1f0ffSBarry Smith Options Database Key:
12662ef1f0ffSBarry Smith . -mat_type h2opus - matrix type to "h2opus"
12672ef1f0ffSBarry Smith
12682ef1f0ffSBarry Smith Level: beginner
1269f17f7ee2SSatish Balay
1270f17f7ee2SSatish Balay Notes:
12711d27aa22SBarry Smith H2Opus implements hierarchical matrices in the $H^2$ flavour. It supports CPU or NVIDIA GPUs.
127211a5261eSBarry Smith
12732ef1f0ffSBarry Smith For CPU only builds, use `./configure --download-h2opus --download-thrust` to install PETSc to use H2Opus.
12742ef1f0ffSBarry Smith In order to run on NVIDIA GPUs, use `./configure --download-h2opus --download-magma --download-kblas`.
1275f17f7ee2SSatish Balay
12761cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MATH2OPUS`, `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()`
1277f17f7ee2SSatish Balay M*/
MatCreate_H2OPUS(Mat A)1278d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
1279d71ae5a4SJacob Faibussowitsch {
1280f17f7ee2SSatish Balay Mat_H2OPUS *a;
1281f17f7ee2SSatish Balay PetscMPIInt size;
1282f17f7ee2SSatish Balay
1283f17f7ee2SSatish Balay PetscFunctionBegin;
1284f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1285f17f7ee2SSatish Balay PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
1286f17f7ee2SSatish Balay #endif
12874dfa11a4SJacob Faibussowitsch PetscCall(PetscNew(&a));
1288f17f7ee2SSatish Balay A->data = (void *)a;
1289f17f7ee2SSatish Balay
1290f17f7ee2SSatish Balay a->eta = 0.9;
1291f17f7ee2SSatish Balay a->leafsize = 32;
1292f17f7ee2SSatish Balay a->basisord = 4;
1293f17f7ee2SSatish Balay a->max_rank = 64;
1294f17f7ee2SSatish Balay a->bs = 32;
1295f17f7ee2SSatish Balay a->rtol = 1.e-4;
1296f17f7ee2SSatish Balay a->s = 1.0;
1297f17f7ee2SSatish Balay a->norm_max_samples = 10;
1298036e4bdcSSatish Balay a->resize = PETSC_TRUE; /* reallocate after compression */
1299f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1300f17f7ee2SSatish Balay h2opusCreateDistributedHandleComm(&a->handle, PetscObjectComm((PetscObject)A));
1301f17f7ee2SSatish Balay #else
1302f17f7ee2SSatish Balay h2opusCreateHandle(&a->handle);
1303f17f7ee2SSatish Balay #endif
1304f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1305f17f7ee2SSatish Balay PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATH2OPUS));
1306f17f7ee2SSatish Balay PetscCall(PetscMemzero(A->ops, sizeof(struct _MatOps)));
1307f17f7ee2SSatish Balay
1308f17f7ee2SSatish Balay A->ops->destroy = MatDestroy_H2OPUS;
1309f17f7ee2SSatish Balay A->ops->view = MatView_H2OPUS;
1310f17f7ee2SSatish Balay A->ops->assemblyend = MatAssemblyEnd_H2OPUS;
1311f17f7ee2SSatish Balay A->ops->mult = MatMult_H2OPUS;
1312f17f7ee2SSatish Balay A->ops->multtranspose = MatMultTranspose_H2OPUS;
1313f17f7ee2SSatish Balay A->ops->multadd = MatMultAdd_H2OPUS;
1314f17f7ee2SSatish Balay A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
1315f17f7ee2SSatish Balay A->ops->scale = MatScale_H2OPUS;
1316f17f7ee2SSatish Balay A->ops->duplicate = MatDuplicate_H2OPUS;
1317f17f7ee2SSatish Balay A->ops->setfromoptions = MatSetFromOptions_H2OPUS;
1318f17f7ee2SSatish Balay A->ops->norm = MatNorm_H2OPUS;
1319f17f7ee2SSatish Balay A->ops->zeroentries = MatZeroEntries_H2OPUS;
1320f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1321f17f7ee2SSatish Balay A->ops->bindtocpu = MatBindToCPU_H2OPUS;
1322f17f7ee2SSatish Balay #endif
1323f17f7ee2SSatish Balay
1324f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", MatProductSetFromOptions_H2OPUS));
1325f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", MatProductSetFromOptions_H2OPUS));
1326f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", MatProductSetFromOptions_H2OPUS));
1327f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", MatProductSetFromOptions_H2OPUS));
1328f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1329f17f7ee2SSatish Balay PetscCall(PetscFree(A->defaultvectype));
1330f17f7ee2SSatish Balay PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
1331f17f7ee2SSatish Balay #endif
13323ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1333f17f7ee2SSatish Balay }
1334f17f7ee2SSatish Balay
1335cc4c1da9SBarry Smith /*@
1336f17f7ee2SSatish Balay MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.
1337f17f7ee2SSatish Balay
1338f17f7ee2SSatish Balay Input Parameter:
1339f17f7ee2SSatish Balay . A - the matrix
1340f17f7ee2SSatish Balay
1341f17f7ee2SSatish Balay Level: intermediate
1342f17f7ee2SSatish Balay
13431cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`
1344f17f7ee2SSatish Balay @*/
MatH2OpusOrthogonalize(Mat A)1345d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusOrthogonalize(Mat A)
1346d71ae5a4SJacob Faibussowitsch {
1347f17f7ee2SSatish Balay PetscBool ish2opus;
1348f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1349f17f7ee2SSatish Balay PetscMPIInt size;
1350f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE;
1351f17f7ee2SSatish Balay
1352f17f7ee2SSatish Balay PetscFunctionBegin;
1353f17f7ee2SSatish Balay PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1354f17f7ee2SSatish Balay PetscValidType(A, 1);
1355f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
13563ba16761SJacob Faibussowitsch if (!ish2opus) PetscFunctionReturn(PETSC_SUCCESS);
13573ba16761SJacob Faibussowitsch if (a->orthogonal) PetscFunctionReturn(PETSC_SUCCESS);
1358f17f7ee2SSatish Balay HLibProfile::clear();
1359f17f7ee2SSatish Balay PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog, A, 0, 0, 0));
1360f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1361f17f7ee2SSatish Balay boundtocpu = A->boundtocpu;
1362f17f7ee2SSatish Balay #endif
1363f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1364f17f7ee2SSatish Balay if (size > 1) {
1365f17f7ee2SSatish Balay if (boundtocpu) {
1366f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1367f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1368f17f7ee2SSatish Balay distributed_horthog(*a->dist_hmatrix, a->handle);
1369f17f7ee2SSatish Balay #endif
1370f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1371f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_CPU;
1372f17f7ee2SSatish Balay } else {
1373f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1374f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin());
1375f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1376f17f7ee2SSatish Balay distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
1377f17f7ee2SSatish Balay #endif
1378f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd());
1379f17f7ee2SSatish Balay #endif
1380f17f7ee2SSatish Balay }
1381f17f7ee2SSatish Balay } else {
1382f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1383f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle->handle;
1384f17f7ee2SSatish Balay #else
1385f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle;
1386f17f7ee2SSatish Balay #endif
1387f17f7ee2SSatish Balay if (boundtocpu) {
1388f17f7ee2SSatish Balay PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1389f17f7ee2SSatish Balay horthog(*a->hmatrix, handle);
1390f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1391f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_CPU;
1392f17f7ee2SSatish Balay } else {
1393f17f7ee2SSatish Balay PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1394f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin());
1395f17f7ee2SSatish Balay horthog(*a->hmatrix_gpu, handle);
1396f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd());
1397f17f7ee2SSatish Balay #endif
1398f17f7ee2SSatish Balay }
1399f17f7ee2SSatish Balay }
1400f17f7ee2SSatish Balay a->orthogonal = PETSC_TRUE;
1401f17f7ee2SSatish Balay { /* log flops */
1402f17f7ee2SSatish Balay double gops, time, perf, dev;
1403f17f7ee2SSatish Balay HLibProfile::getHorthogPerf(gops, time, perf, dev);
1404f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1405f17f7ee2SSatish Balay if (boundtocpu) {
1406f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9 * gops));
1407f17f7ee2SSatish Balay } else {
1408f17f7ee2SSatish Balay PetscCall(PetscLogGpuFlops(1e9 * gops));
1409f17f7ee2SSatish Balay }
1410f17f7ee2SSatish Balay #else
1411f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9 * gops));
1412f17f7ee2SSatish Balay #endif
1413f17f7ee2SSatish Balay }
1414f17f7ee2SSatish Balay PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog, A, 0, 0, 0));
14153ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1416f17f7ee2SSatish Balay }
1417f17f7ee2SSatish Balay
1418cc4c1da9SBarry Smith /*@
1419f17f7ee2SSatish Balay MatH2OpusCompress - Compress a hierarchical matrix.
1420f17f7ee2SSatish Balay
1421f17f7ee2SSatish Balay Input Parameters:
1422f17f7ee2SSatish Balay + A - the matrix
1423f17f7ee2SSatish Balay - tol - the absolute truncation threshold
1424f17f7ee2SSatish Balay
1425f17f7ee2SSatish Balay Level: intermediate
1426f17f7ee2SSatish Balay
14271cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()`
1428f17f7ee2SSatish Balay @*/
MatH2OpusCompress(Mat A,PetscReal tol)1429d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
1430d71ae5a4SJacob Faibussowitsch {
1431f17f7ee2SSatish Balay PetscBool ish2opus;
1432f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1433f17f7ee2SSatish Balay PetscMPIInt size;
1434f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE;
1435f17f7ee2SSatish Balay
1436f17f7ee2SSatish Balay PetscFunctionBegin;
1437f17f7ee2SSatish Balay PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1438f17f7ee2SSatish Balay PetscValidType(A, 1);
1439f17f7ee2SSatish Balay PetscValidLogicalCollectiveReal(A, tol, 2);
1440f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
14413ba16761SJacob Faibussowitsch if (!ish2opus || tol <= 0.0) PetscFunctionReturn(PETSC_SUCCESS);
1442f17f7ee2SSatish Balay PetscCall(MatH2OpusOrthogonalize(A));
1443f17f7ee2SSatish Balay HLibProfile::clear();
1444f17f7ee2SSatish Balay PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress, A, 0, 0, 0));
1445f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1446f17f7ee2SSatish Balay boundtocpu = A->boundtocpu;
1447f17f7ee2SSatish Balay #endif
1448f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1449f17f7ee2SSatish Balay if (size > 1) {
1450f17f7ee2SSatish Balay if (boundtocpu) {
1451f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1452f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1453f17f7ee2SSatish Balay distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
1454036e4bdcSSatish Balay if (a->resize) {
1455036e4bdcSSatish Balay DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix);
1456036e4bdcSSatish Balay delete a->dist_hmatrix;
1457036e4bdcSSatish Balay a->dist_hmatrix = dist_hmatrix;
1458036e4bdcSSatish Balay }
1459f17f7ee2SSatish Balay #endif
1460f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1461f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_CPU;
1462f17f7ee2SSatish Balay } else {
1463f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1464f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin());
1465f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1466f17f7ee2SSatish Balay distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);
1467036e4bdcSSatish Balay
1468036e4bdcSSatish Balay if (a->resize) {
1469036e4bdcSSatish Balay DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu);
1470036e4bdcSSatish Balay delete a->dist_hmatrix_gpu;
1471036e4bdcSSatish Balay a->dist_hmatrix_gpu = dist_hmatrix_gpu;
1472036e4bdcSSatish Balay }
1473f17f7ee2SSatish Balay #endif
1474f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd());
1475f17f7ee2SSatish Balay #endif
1476f17f7ee2SSatish Balay }
1477f17f7ee2SSatish Balay } else {
1478f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1479f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle->handle;
1480f17f7ee2SSatish Balay #else
1481f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle;
1482f17f7ee2SSatish Balay #endif
1483f17f7ee2SSatish Balay if (boundtocpu) {
1484f17f7ee2SSatish Balay PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1485f17f7ee2SSatish Balay hcompress(*a->hmatrix, tol, handle);
1486036e4bdcSSatish Balay
1487036e4bdcSSatish Balay if (a->resize) {
1488036e4bdcSSatish Balay HMatrix *hmatrix = new HMatrix(*a->hmatrix);
1489036e4bdcSSatish Balay delete a->hmatrix;
1490036e4bdcSSatish Balay a->hmatrix = hmatrix;
1491036e4bdcSSatish Balay }
1492f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1493f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_CPU;
1494f17f7ee2SSatish Balay } else {
1495f17f7ee2SSatish Balay PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1496f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin());
1497f17f7ee2SSatish Balay hcompress(*a->hmatrix_gpu, tol, handle);
1498f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd());
1499036e4bdcSSatish Balay
1500036e4bdcSSatish Balay if (a->resize) {
1501036e4bdcSSatish Balay HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu);
1502036e4bdcSSatish Balay delete a->hmatrix_gpu;
1503036e4bdcSSatish Balay a->hmatrix_gpu = hmatrix_gpu;
1504036e4bdcSSatish Balay }
1505f17f7ee2SSatish Balay #endif
1506f17f7ee2SSatish Balay }
1507f17f7ee2SSatish Balay }
1508f17f7ee2SSatish Balay { /* log flops */
1509f17f7ee2SSatish Balay double gops, time, perf, dev;
1510f17f7ee2SSatish Balay HLibProfile::getHcompressPerf(gops, time, perf, dev);
1511f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1512f17f7ee2SSatish Balay if (boundtocpu) {
1513f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9 * gops));
1514f17f7ee2SSatish Balay } else {
1515f17f7ee2SSatish Balay PetscCall(PetscLogGpuFlops(1e9 * gops));
1516f17f7ee2SSatish Balay }
1517f17f7ee2SSatish Balay #else
1518f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9 * gops));
1519f17f7ee2SSatish Balay #endif
1520f17f7ee2SSatish Balay }
1521f17f7ee2SSatish Balay PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress, A, 0, 0, 0));
15223ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1523f17f7ee2SSatish Balay }
1524f17f7ee2SSatish Balay
1525cc4c1da9SBarry Smith /*@
15262ef1f0ffSBarry Smith MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix-vector products on another matrix to construct a hierarchical matrix.
1527f17f7ee2SSatish Balay
1528f17f7ee2SSatish Balay Input Parameters:
1529f17f7ee2SSatish Balay + A - the hierarchical matrix
1530f17f7ee2SSatish Balay . B - the matrix to be sampled
1531f17f7ee2SSatish Balay . bs - maximum number of samples to be taken concurrently
1532f17f7ee2SSatish Balay - tol - relative tolerance for construction
1533f17f7ee2SSatish Balay
15341d27aa22SBarry Smith Level: intermediate
15351d27aa22SBarry Smith
153611a5261eSBarry Smith Notes:
153711a5261eSBarry Smith You need to call `MatAssemblyBegin()` and `MatAssemblyEnd()` to update the hierarchical matrix.
1538f17f7ee2SSatish Balay
15391cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`
1540f17f7ee2SSatish Balay @*/
MatH2OpusSetSamplingMat(Mat A,Mat B,PetscInt bs,PetscReal tol)1541d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1542d71ae5a4SJacob Faibussowitsch {
1543f17f7ee2SSatish Balay PetscBool ish2opus;
1544f17f7ee2SSatish Balay
1545f17f7ee2SSatish Balay PetscFunctionBegin;
1546f17f7ee2SSatish Balay PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1547f17f7ee2SSatish Balay PetscValidType(A, 1);
1548f17f7ee2SSatish Balay if (B) PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
1549f17f7ee2SSatish Balay PetscValidLogicalCollectiveInt(A, bs, 3);
155007e0ed13SBarry Smith PetscValidLogicalCollectiveReal(A, tol, 4);
1551f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1552f17f7ee2SSatish Balay if (ish2opus) {
1553f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1554f17f7ee2SSatish Balay
1555f17f7ee2SSatish Balay if (!a->sampler) a->sampler = new PetscMatrixSampler();
1556f17f7ee2SSatish Balay a->sampler->SetSamplingMat(B);
1557f17f7ee2SSatish Balay if (bs > 0) a->bs = bs;
1558f17f7ee2SSatish Balay if (tol > 0.) a->rtol = tol;
1559f17f7ee2SSatish Balay delete a->kernel;
1560f17f7ee2SSatish Balay }
15613ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1562f17f7ee2SSatish Balay }
1563f17f7ee2SSatish Balay
1564f17f7ee2SSatish Balay /*@C
156511a5261eSBarry Smith MatCreateH2OpusFromKernel - Creates a `MATH2OPUS` from a user-supplied kernel.
1566f17f7ee2SSatish Balay
1567f17f7ee2SSatish Balay Input Parameters:
1568f17f7ee2SSatish Balay + comm - MPI communicator
15692ef1f0ffSBarry Smith . m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
15702ef1f0ffSBarry Smith . n - number of local columns (or `PETSC_DECIDE` to have calculated if `N` is given)
15712ef1f0ffSBarry Smith . M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
15722ef1f0ffSBarry Smith . N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
1573f17f7ee2SSatish Balay . spacedim - dimension of the space coordinates
1574f17f7ee2SSatish Balay . coords - coordinates of the points
1575f17f7ee2SSatish Balay . cdist - whether or not coordinates are distributed
15762ef1f0ffSBarry Smith . kernel - computational kernel (or `NULL`)
1577f17f7ee2SSatish Balay . kernelctx - kernel context
1578f17f7ee2SSatish Balay . eta - admissibility condition tolerance
1579f17f7ee2SSatish Balay . leafsize - leaf size in cluster tree
1580f17f7ee2SSatish Balay - basisord - approximation order for Chebychev interpolation of low-rank blocks
1581f17f7ee2SSatish Balay
1582f17f7ee2SSatish Balay Output Parameter:
1583f17f7ee2SSatish Balay . nA - matrix
1584f17f7ee2SSatish Balay
1585f17f7ee2SSatish Balay Options Database Keys:
158611a5261eSBarry Smith + -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree
158711a5261eSBarry Smith . -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance
158811a5261eSBarry Smith . -mat_h2opus_order <`PetscInt`> - Chebychev approximation order
158911a5261eSBarry Smith - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms
1590f17f7ee2SSatish Balay
1591f17f7ee2SSatish Balay Level: intermediate
1592f17f7ee2SSatish Balay
15931cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`
1594f17f7ee2SSatish Balay @*/
MatCreateH2OpusFromKernel(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt spacedim,const PetscReal coords[],PetscBool cdist,MatH2OpusKernelFn * kernel,void * kernelctx,PetscReal eta,PetscInt leafsize,PetscInt basisord,Mat * nA)15958434afd1SBarry Smith PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernelFn *kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat *nA)
1596d71ae5a4SJacob Faibussowitsch {
1597f17f7ee2SSatish Balay Mat A;
1598f17f7ee2SSatish Balay Mat_H2OPUS *h2opus;
1599*fc2fb351SPierre Jolivet PetscBool iscpu = PetscDefined(H2OPUS_USE_GPU) ? PETSC_FALSE : PETSC_TRUE;
1600f17f7ee2SSatish Balay
1601f17f7ee2SSatish Balay PetscFunctionBegin;
1602036e4bdcSSatish Balay PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1603f17f7ee2SSatish Balay PetscCall(MatCreate(comm, &A));
1604f17f7ee2SSatish Balay PetscCall(MatSetSizes(A, m, n, M, N));
1605036e4bdcSSatish Balay PetscCheck(M == N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1606f17f7ee2SSatish Balay PetscCall(MatSetType(A, MATH2OPUS));
1607f17f7ee2SSatish Balay PetscCall(MatBindToCPU(A, iscpu));
1608f17f7ee2SSatish Balay PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, kernel, kernelctx));
1609f17f7ee2SSatish Balay
1610f17f7ee2SSatish Balay h2opus = (Mat_H2OPUS *)A->data;
1611f17f7ee2SSatish Balay if (eta > 0.) h2opus->eta = eta;
1612f17f7ee2SSatish Balay if (leafsize > 0) h2opus->leafsize = leafsize;
1613f17f7ee2SSatish Balay if (basisord > 0) h2opus->basisord = basisord;
1614f17f7ee2SSatish Balay
1615f17f7ee2SSatish Balay *nA = A;
16163ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1617f17f7ee2SSatish Balay }
1618f17f7ee2SSatish Balay
1619cc4c1da9SBarry Smith /*@
162011a5261eSBarry Smith MatCreateH2OpusFromMat - Creates a `MATH2OPUS` sampling from a user-supplied operator.
1621f17f7ee2SSatish Balay
1622f17f7ee2SSatish Balay Input Parameters:
1623f17f7ee2SSatish Balay + B - the matrix to be sampled
1624f17f7ee2SSatish Balay . spacedim - dimension of the space coordinates
1625f17f7ee2SSatish Balay . coords - coordinates of the points
1626f17f7ee2SSatish Balay . cdist - whether or not coordinates are distributed
1627f17f7ee2SSatish Balay . eta - admissibility condition tolerance
1628f17f7ee2SSatish Balay . leafsize - leaf size in cluster tree
1629f17f7ee2SSatish Balay . maxrank - maximum rank allowed
1630f17f7ee2SSatish Balay . bs - maximum number of samples to be taken concurrently
1631f17f7ee2SSatish Balay - rtol - relative tolerance for construction
1632f17f7ee2SSatish Balay
1633f17f7ee2SSatish Balay Output Parameter:
1634f17f7ee2SSatish Balay . nA - matrix
1635f17f7ee2SSatish Balay
1636f17f7ee2SSatish Balay Options Database Keys:
163711a5261eSBarry Smith + -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree
163811a5261eSBarry Smith . -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance
163911a5261eSBarry Smith . -mat_h2opus_maxrank <`PetscInt`> - Maximum rank when constructed from matvecs
164011a5261eSBarry Smith . -mat_h2opus_samples <`PetscInt`> - Maximum number of samples to be taken concurrently when constructing from matvecs
164111a5261eSBarry Smith . -mat_h2opus_rtol <`PetscReal`> - Relative tolerance for construction from sampling
164211a5261eSBarry Smith . -mat_h2opus_check <`PetscBool`> - Check error when constructing from sampling during MatAssemblyEnd()
164311a5261eSBarry Smith . -mat_h2opus_hara_verbose <`PetscBool`> - Verbose output from hara construction
1644aaa8cc7dSPierre Jolivet - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be when estimating norms
1645f17f7ee2SSatish Balay
16462ef1f0ffSBarry Smith Level: intermediate
16472ef1f0ffSBarry Smith
164811a5261eSBarry Smith Note:
164911a5261eSBarry Smith Not available in parallel
1650f17f7ee2SSatish Balay
16511cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()`
1652f17f7ee2SSatish Balay @*/
MatCreateH2OpusFromMat(Mat B,PetscInt spacedim,const PetscReal coords[],PetscBool cdist,PetscReal eta,PetscInt leafsize,PetscInt maxrank,PetscInt bs,PetscReal rtol,Mat * nA)1653d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1654d71ae5a4SJacob Faibussowitsch {
1655f17f7ee2SSatish Balay Mat A;
1656f17f7ee2SSatish Balay Mat_H2OPUS *h2opus;
1657f17f7ee2SSatish Balay MPI_Comm comm;
1658f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE;
1659f17f7ee2SSatish Balay
1660f17f7ee2SSatish Balay PetscFunctionBegin;
1661f17f7ee2SSatish Balay PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
1662f17f7ee2SSatish Balay PetscValidLogicalCollectiveInt(B, spacedim, 2);
1663f17f7ee2SSatish Balay PetscValidLogicalCollectiveBool(B, cdist, 4);
1664f17f7ee2SSatish Balay PetscValidLogicalCollectiveReal(B, eta, 5);
1665f17f7ee2SSatish Balay PetscValidLogicalCollectiveInt(B, leafsize, 6);
1666f17f7ee2SSatish Balay PetscValidLogicalCollectiveInt(B, maxrank, 7);
1667f17f7ee2SSatish Balay PetscValidLogicalCollectiveInt(B, bs, 8);
1668f17f7ee2SSatish Balay PetscValidLogicalCollectiveReal(B, rtol, 9);
16694f572ea9SToby Isaac PetscAssertPointer(nA, 10);
1670f17f7ee2SSatish Balay PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
1671036e4bdcSSatish Balay PetscCheck(B->rmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1672036e4bdcSSatish Balay PetscCheck(B->rmap->N == B->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1673f17f7ee2SSatish Balay PetscCall(MatCreate(comm, &A));
1674f17f7ee2SSatish Balay PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
1675f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1676f17f7ee2SSatish Balay {
1677f17f7ee2SSatish Balay VecType vtype;
16782592a002SStefano Zampini PetscBool isstd, iscuda, iskok;
1679f17f7ee2SSatish Balay
1680f17f7ee2SSatish Balay PetscCall(MatGetVecType(B, &vtype));
16812592a002SStefano Zampini PetscCall(PetscStrcmpAny(vtype, &isstd, VECSTANDARD, VECSEQ, VECMPI, ""));
16822592a002SStefano Zampini PetscCall(PetscStrcmpAny(vtype, &iscuda, VECCUDA, VECSEQCUDA, VECMPICUDA, ""));
16832592a002SStefano Zampini PetscCall(PetscStrcmpAny(vtype, &iskok, VECKOKKOS, VECSEQKOKKOS, VECMPIKOKKOS, ""));
16842592a002SStefano Zampini PetscCheck(isstd || iscuda || iskok, comm, PETSC_ERR_SUP, "Not for type %s", vtype);
1685f17f7ee2SSatish Balay if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
16862592a002SStefano Zampini if (iskok && PetscDefined(HAVE_MACRO_KOKKOS_ENABLE_CUDA)) boundtocpu = PETSC_FALSE;
1687f17f7ee2SSatish Balay }
1688f17f7ee2SSatish Balay #endif
1689f17f7ee2SSatish Balay PetscCall(MatSetType(A, MATH2OPUS));
1690f17f7ee2SSatish Balay PetscCall(MatBindToCPU(A, boundtocpu));
169148a46eb9SPierre Jolivet if (spacedim) PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, NULL, NULL));
1692f17f7ee2SSatish Balay PetscCall(MatPropagateSymmetryOptions(B, A));
1693f17f7ee2SSatish Balay /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */
1694f17f7ee2SSatish Balay
1695f17f7ee2SSatish Balay h2opus = (Mat_H2OPUS *)A->data;
1696f17f7ee2SSatish Balay h2opus->sampler = new PetscMatrixSampler(B);
1697f17f7ee2SSatish Balay if (eta > 0.) h2opus->eta = eta;
1698f17f7ee2SSatish Balay if (leafsize > 0) h2opus->leafsize = leafsize;
1699f17f7ee2SSatish Balay if (maxrank > 0) h2opus->max_rank = maxrank;
1700f17f7ee2SSatish Balay if (bs > 0) h2opus->bs = bs;
1701f17f7ee2SSatish Balay if (rtol > 0.) h2opus->rtol = rtol;
1702f17f7ee2SSatish Balay *nA = A;
1703f17f7ee2SSatish Balay A->preallocated = PETSC_TRUE;
17043ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1705f17f7ee2SSatish Balay }
1706f17f7ee2SSatish Balay
1707cc4c1da9SBarry Smith /*@
1708f17f7ee2SSatish Balay MatH2OpusGetIndexMap - Access reordering index set.
1709f17f7ee2SSatish Balay
17102fe279fdSBarry Smith Input Parameter:
1711f17f7ee2SSatish Balay . A - the matrix
1712f17f7ee2SSatish Balay
1713f17f7ee2SSatish Balay Output Parameter:
1714f17f7ee2SSatish Balay . indexmap - the index set for the reordering
1715f17f7ee2SSatish Balay
1716f17f7ee2SSatish Balay Level: intermediate
1717f17f7ee2SSatish Balay
17181cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1719f17f7ee2SSatish Balay @*/
MatH2OpusGetIndexMap(Mat A,IS * indexmap)1720d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1721d71ae5a4SJacob Faibussowitsch {
1722f17f7ee2SSatish Balay PetscBool ish2opus;
1723f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1724f17f7ee2SSatish Balay
1725f17f7ee2SSatish Balay PetscFunctionBegin;
1726f17f7ee2SSatish Balay PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1727f17f7ee2SSatish Balay PetscValidType(A, 1);
17284f572ea9SToby Isaac PetscAssertPointer(indexmap, 2);
1729f17f7ee2SSatish Balay PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1730f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1731f17f7ee2SSatish Balay PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1732f17f7ee2SSatish Balay *indexmap = a->h2opus_indexmap;
17333ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1734f17f7ee2SSatish Balay }
1735f17f7ee2SSatish Balay
1736cc4c1da9SBarry Smith /*@
1737f17f7ee2SSatish Balay MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering
1738f17f7ee2SSatish Balay
1739f17f7ee2SSatish Balay Input Parameters:
1740f17f7ee2SSatish Balay + A - the matrix
1741f17f7ee2SSatish Balay . nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1742f17f7ee2SSatish Balay - in - the vector to be mapped
1743f17f7ee2SSatish Balay
1744f17f7ee2SSatish Balay Output Parameter:
1745f17f7ee2SSatish Balay . out - the newly created mapped vector
1746f17f7ee2SSatish Balay
1747f17f7ee2SSatish Balay Level: intermediate
1748f17f7ee2SSatish Balay
17491cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1750f17f7ee2SSatish Balay @*/
MatH2OpusMapVec(Mat A,PetscBool nativetopetsc,Vec in,Vec * out)1751d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec *out)
1752d71ae5a4SJacob Faibussowitsch {
1753f17f7ee2SSatish Balay PetscBool ish2opus;
1754f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1755f17f7ee2SSatish Balay PetscScalar *xin, *xout;
1756f17f7ee2SSatish Balay PetscBool nm;
1757f17f7ee2SSatish Balay
1758f17f7ee2SSatish Balay PetscFunctionBegin;
1759f17f7ee2SSatish Balay PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1760f17f7ee2SSatish Balay PetscValidType(A, 1);
1761f17f7ee2SSatish Balay PetscValidLogicalCollectiveBool(A, nativetopetsc, 2);
1762f17f7ee2SSatish Balay PetscValidHeaderSpecific(in, VEC_CLASSID, 3);
17634f572ea9SToby Isaac PetscAssertPointer(out, 4);
1764f17f7ee2SSatish Balay PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1765f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1766f17f7ee2SSatish Balay PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1767f17f7ee2SSatish Balay nm = a->nativemult;
1768f17f7ee2SSatish Balay PetscCall(MatH2OpusSetNativeMult(A, (PetscBool)!nativetopetsc));
1769f17f7ee2SSatish Balay PetscCall(MatCreateVecs(A, out, NULL));
1770f17f7ee2SSatish Balay PetscCall(MatH2OpusSetNativeMult(A, nm));
1771f17f7ee2SSatish Balay if (!a->sf) { /* same ordering */
1772f17f7ee2SSatish Balay PetscCall(VecCopy(in, *out));
17733ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1774f17f7ee2SSatish Balay }
1775f17f7ee2SSatish Balay PetscCall(VecGetArrayRead(in, (const PetscScalar **)&xin));
1776f17f7ee2SSatish Balay PetscCall(VecGetArrayWrite(*out, &xout));
1777f17f7ee2SSatish Balay if (nativetopetsc) {
1778f17f7ee2SSatish Balay PetscCall(PetscSFReduceBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1779f17f7ee2SSatish Balay PetscCall(PetscSFReduceEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1780f17f7ee2SSatish Balay } else {
1781f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1782f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1783f17f7ee2SSatish Balay }
1784f17f7ee2SSatish Balay PetscCall(VecRestoreArrayRead(in, (const PetscScalar **)&xin));
1785f17f7ee2SSatish Balay PetscCall(VecRestoreArrayWrite(*out, &xout));
17863ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1787f17f7ee2SSatish Balay }
1788f17f7ee2SSatish Balay
1789cc4c1da9SBarry Smith /*@
17901d27aa22SBarry Smith MatH2OpusLowRankUpdate - Perform a low-rank update of the form $ A = A + s * U * V^T $
1791f17f7ee2SSatish Balay
1792f17f7ee2SSatish Balay Input Parameters:
179311a5261eSBarry Smith + A - the hierarchical `MATH2OPUS` matrix
1794f17f7ee2SSatish Balay . s - the scaling factor
1795f17f7ee2SSatish Balay . U - the dense low-rank update matrix
17962ef1f0ffSBarry Smith - V - (optional) the dense low-rank update matrix (if `NULL`, then `V` = `U` is assumed)
1797f17f7ee2SSatish Balay
179811a5261eSBarry Smith Note:
17991d27aa22SBarry Smith The `U` and `V` matrices must be in `MATDENSE` dense format
1800f17f7ee2SSatish Balay
1801f17f7ee2SSatish Balay Level: intermediate
1802f17f7ee2SSatish Balay
18031cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE`
1804f17f7ee2SSatish Balay @*/
MatH2OpusLowRankUpdate(Mat A,Mat U,Mat V,PetscScalar s)1805d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s)
1806d71ae5a4SJacob Faibussowitsch {
1807f17f7ee2SSatish Balay PetscBool flg;
1808f17f7ee2SSatish Balay
1809f17f7ee2SSatish Balay PetscFunctionBegin;
1810f17f7ee2SSatish Balay PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1811f17f7ee2SSatish Balay PetscValidType(A, 1);
1812f17f7ee2SSatish Balay PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1813f17f7ee2SSatish Balay PetscValidHeaderSpecific(U, MAT_CLASSID, 2);
1814f17f7ee2SSatish Balay PetscCheckSameComm(A, 1, U, 2);
1815f17f7ee2SSatish Balay if (V) {
1816f17f7ee2SSatish Balay PetscValidHeaderSpecific(V, MAT_CLASSID, 3);
1817f17f7ee2SSatish Balay PetscCheckSameComm(A, 1, V, 3);
1818f17f7ee2SSatish Balay }
1819f17f7ee2SSatish Balay PetscValidLogicalCollectiveScalar(A, s, 4);
1820f17f7ee2SSatish Balay
1821f17f7ee2SSatish Balay if (!V) V = U;
1822036e4bdcSSatish Balay PetscCheck(U->cmap->N == V->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT, U->cmap->N, V->cmap->N);
18233ba16761SJacob Faibussowitsch if (!U->cmap->N) PetscFunctionReturn(PETSC_SUCCESS);
1824f17f7ee2SSatish Balay PetscCall(PetscLayoutCompare(U->rmap, A->rmap, &flg));
1825f17f7ee2SSatish Balay PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A and U must have the same row layout");
1826f17f7ee2SSatish Balay PetscCall(PetscLayoutCompare(V->rmap, A->cmap, &flg));
1827f17f7ee2SSatish Balay PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A column layout must match V row column layout");
1828f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &flg));
1829f17f7ee2SSatish Balay if (flg) {
1830f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1831f17f7ee2SSatish Balay const PetscScalar *u, *v, *uu, *vv;
1832f17f7ee2SSatish Balay PetscInt ldu, ldv;
1833f17f7ee2SSatish Balay PetscMPIInt size;
1834f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1835f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle->handle;
1836f17f7ee2SSatish Balay #else
1837f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle;
1838f17f7ee2SSatish Balay #endif
1839f17f7ee2SSatish Balay PetscBool usesf = (PetscBool)(a->sf && !a->nativemult);
1840f17f7ee2SSatish Balay PetscSF usf, vsf;
1841f17f7ee2SSatish Balay
1842f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1843036e4bdcSSatish Balay PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet implemented in parallel");
1844f17f7ee2SSatish Balay PetscCall(PetscLogEventBegin(MAT_H2Opus_LR, A, 0, 0, 0));
1845f17f7ee2SSatish Balay PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U, &flg, MATSEQDENSE, MATMPIDENSE, ""));
1846f17f7ee2SSatish Balay PetscCheck(flg, PetscObjectComm((PetscObject)U), PETSC_ERR_SUP, "Not for U of type %s", ((PetscObject)U)->type_name);
1847f17f7ee2SSatish Balay PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V, &flg, MATSEQDENSE, MATMPIDENSE, ""));
1848f17f7ee2SSatish Balay PetscCheck(flg, PetscObjectComm((PetscObject)V), PETSC_ERR_SUP, "Not for V of type %s", ((PetscObject)V)->type_name);
1849f17f7ee2SSatish Balay PetscCall(MatDenseGetLDA(U, &ldu));
1850f17f7ee2SSatish Balay PetscCall(MatDenseGetLDA(V, &ldv));
1851f17f7ee2SSatish Balay PetscCall(MatBoundToCPU(A, &flg));
1852f17f7ee2SSatish Balay if (usesf) {
1853f17f7ee2SSatish Balay PetscInt n;
1854f17f7ee2SSatish Balay
18550dd791a8SStefano Zampini PetscCall(MatDenseGetH2OpusStridedSF(U, a->sf, &usf));
18560dd791a8SStefano Zampini PetscCall(MatDenseGetH2OpusStridedSF(V, a->sf, &vsf));
1857f17f7ee2SSatish Balay PetscCall(MatH2OpusResizeBuffers_Private(A, U->cmap->N, V->cmap->N));
1858f17f7ee2SSatish Balay PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
1859f17f7ee2SSatish Balay ldu = n;
1860f17f7ee2SSatish Balay ldv = n;
1861f17f7ee2SSatish Balay }
1862f17f7ee2SSatish Balay if (flg) {
1863f17f7ee2SSatish Balay PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1864f17f7ee2SSatish Balay PetscCall(MatDenseGetArrayRead(U, &u));
1865f17f7ee2SSatish Balay PetscCall(MatDenseGetArrayRead(V, &v));
1866f17f7ee2SSatish Balay if (usesf) {
1867f17f7ee2SSatish Balay vv = MatH2OpusGetThrustPointer(*a->yy);
1868f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1869f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1870f17f7ee2SSatish Balay if (U != V) {
1871f17f7ee2SSatish Balay uu = MatH2OpusGetThrustPointer(*a->xx);
1872f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1873f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1874f17f7ee2SSatish Balay } else uu = vv;
18759371c9d4SSatish Balay } else {
18769371c9d4SSatish Balay uu = u;
18779371c9d4SSatish Balay vv = v;
18789371c9d4SSatish Balay }
1879f17f7ee2SSatish Balay hlru_global(*a->hmatrix, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1880f17f7ee2SSatish Balay PetscCall(MatDenseRestoreArrayRead(U, &u));
1881f17f7ee2SSatish Balay PetscCall(MatDenseRestoreArrayRead(V, &v));
1882f17f7ee2SSatish Balay } else {
1883f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1884f17f7ee2SSatish Balay PetscBool flgU, flgV;
1885f17f7ee2SSatish Balay
1886f17f7ee2SSatish Balay PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1887f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)U, &flgU, MATSEQDENSE, MATMPIDENSE, ""));
1888f17f7ee2SSatish Balay if (flgU) PetscCall(MatConvert(U, MATDENSECUDA, MAT_INPLACE_MATRIX, &U));
1889f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)V, &flgV, MATSEQDENSE, MATMPIDENSE, ""));
1890f17f7ee2SSatish Balay if (flgV) PetscCall(MatConvert(V, MATDENSECUDA, MAT_INPLACE_MATRIX, &V));
1891f17f7ee2SSatish Balay PetscCall(MatDenseCUDAGetArrayRead(U, &u));
1892f17f7ee2SSatish Balay PetscCall(MatDenseCUDAGetArrayRead(V, &v));
1893f17f7ee2SSatish Balay if (usesf) {
1894f17f7ee2SSatish Balay vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
1895f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1896f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1897f17f7ee2SSatish Balay if (U != V) {
1898f17f7ee2SSatish Balay uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
1899f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1900f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1901f17f7ee2SSatish Balay } else uu = vv;
19029371c9d4SSatish Balay } else {
19039371c9d4SSatish Balay uu = u;
19049371c9d4SSatish Balay vv = v;
19059371c9d4SSatish Balay }
1906f17f7ee2SSatish Balay #else
1907f17f7ee2SSatish Balay SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "This should not happen");
1908f17f7ee2SSatish Balay #endif
1909f17f7ee2SSatish Balay hlru_global(*a->hmatrix_gpu, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1910f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1911f17f7ee2SSatish Balay PetscCall(MatDenseCUDARestoreArrayRead(U, &u));
1912f17f7ee2SSatish Balay PetscCall(MatDenseCUDARestoreArrayRead(V, &v));
1913f17f7ee2SSatish Balay if (flgU) PetscCall(MatConvert(U, MATDENSE, MAT_INPLACE_MATRIX, &U));
1914f17f7ee2SSatish Balay if (flgV) PetscCall(MatConvert(V, MATDENSE, MAT_INPLACE_MATRIX, &V));
1915f17f7ee2SSatish Balay #endif
1916f17f7ee2SSatish Balay }
1917f17f7ee2SSatish Balay PetscCall(PetscLogEventEnd(MAT_H2Opus_LR, A, 0, 0, 0));
1918f17f7ee2SSatish Balay a->orthogonal = PETSC_FALSE;
1919f17f7ee2SSatish Balay }
19203ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1921f17f7ee2SSatish Balay }
1922f17f7ee2SSatish Balay #endif
1923