1 #include <h2opusconf.h> 2 /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */ 3 #if !defined(__CUDACC__) || defined(H2OPUS_USE_GPU) 4 #include <h2opus.h> 5 #if defined(H2OPUS_USE_MPI) 6 #include <h2opus/distributed/distributed_h2opus_handle.h> 7 #include <h2opus/distributed/distributed_geometric_construction.h> 8 #include <h2opus/distributed/distributed_hgemv.h> 9 #include <h2opus/distributed/distributed_horthog.h> 10 #include <h2opus/distributed/distributed_hcompress.h> 11 #endif 12 #include <h2opus/util/boxentrygen.h> 13 #include <petsc/private/matimpl.h> 14 #include <petsc/private/vecimpl.h> 15 #include <petsc/private/deviceimpl.h> 16 #include <petscsf.h> 17 18 /* math2opusutils */ 19 PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF, PetscInt, PetscInt, PetscInt, PetscSF *); 20 PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat, PetscSF, PetscSF *); 21 PETSC_INTERN PetscErrorCode VecSign(Vec, Vec); 22 PETSC_INTERN PetscErrorCode VecSetDelta(Vec, PetscInt); 23 PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat, NormType, PetscInt, PetscReal *); 24 25 #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data()) 26 27 /* Use GPU only if H2OPUS is configured for GPU */ 28 #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU) 29 #define PETSC_H2OPUS_USE_GPU 30 #endif 31 #if defined(PETSC_H2OPUS_USE_GPU) 32 #define MatH2OpusUpdateIfNeeded(A, B) MatBindToCPU(A, (PetscBool)((A)->boundtocpu || (B))) 33 #else 34 #define MatH2OpusUpdateIfNeeded(A, B) 0 35 #endif 36 37 // TODO H2OPUS: 38 // DistributedHMatrix 39 // unsymmetric ? 40 // transpose for distributed_hgemv? 41 // clearData() 42 // Unify interface for sequential and parallel? 43 // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled) 44 // 45 template <class T> 46 class PetscPointCloud : public H2OpusDataSet<T> { 47 private: 48 int dimension; 49 size_t num_points; 50 std::vector<T> pts; 51 52 public: 53 PetscPointCloud(int dim, size_t num_pts, const T coords[]) { 54 dim = dim > 0 ? dim : 1; 55 this->dimension = dim; 56 this->num_points = num_pts; 57 58 pts.resize(num_pts * dim); 59 if (coords) { 60 for (size_t n = 0; n < num_pts; n++) 61 for (int i = 0; i < dim; i++) pts[n * dim + i] = coords[n * dim + i]; 62 } else { 63 PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0; 64 for (size_t n = 0; n < num_pts; n++) { 65 pts[n * dim] = n * h; 66 for (int i = 1; i < dim; i++) pts[n * dim + i] = 0.0; 67 } 68 } 69 } 70 71 PetscPointCloud(const PetscPointCloud<T> &other) { 72 size_t N = other.dimension * other.num_points; 73 this->dimension = other.dimension; 74 this->num_points = other.num_points; 75 this->pts.resize(N); 76 for (size_t i = 0; i < N; i++) this->pts[i] = other.pts[i]; 77 } 78 79 int getDimension() const { return dimension; } 80 81 size_t getDataSetSize() const { return num_points; } 82 83 T getDataPoint(size_t idx, int dim) const { 84 assert(dim < dimension && idx < num_points); 85 return pts[idx * dimension + dim]; 86 } 87 88 void Print(std::ostream &out = std::cout) { 89 out << "Dimension: " << dimension << std::endl; 90 out << "NumPoints: " << num_points << std::endl; 91 for (size_t n = 0; n < num_points; n++) { 92 for (int d = 0; d < dimension; d++) out << pts[n * dimension + d] << " "; 93 out << std::endl; 94 } 95 } 96 }; 97 98 template <class T> 99 class PetscFunctionGenerator { 100 private: 101 MatH2OpusKernel k; 102 int dim; 103 void *ctx; 104 105 public: 106 PetscFunctionGenerator(MatH2OpusKernel k, int dim, void *ctx) { 107 this->k = k; 108 this->dim = dim; 109 this->ctx = ctx; 110 } 111 PetscFunctionGenerator(PetscFunctionGenerator &other) { 112 this->k = other.k; 113 this->dim = other.dim; 114 this->ctx = other.ctx; 115 } 116 T operator()(PetscReal *pt1, PetscReal *pt2) { return (T)((*this->k)(this->dim, pt1, pt2, this->ctx)); } 117 }; 118 119 #include <../src/mat/impls/h2opus/math2opussampler.hpp> 120 121 /* just to not clutter the code */ 122 #if !defined(H2OPUS_USE_GPU) 123 typedef HMatrix HMatrix_GPU; 124 #if defined(H2OPUS_USE_MPI) 125 typedef DistributedHMatrix DistributedHMatrix_GPU; 126 #endif 127 #endif 128 129 typedef struct { 130 #if defined(H2OPUS_USE_MPI) 131 distributedH2OpusHandle_t handle; 132 #else 133 h2opusHandle_t handle; 134 #endif 135 /* Sequential and parallel matrices are two different classes at the moment */ 136 HMatrix *hmatrix; 137 #if defined(H2OPUS_USE_MPI) 138 DistributedHMatrix *dist_hmatrix; 139 #else 140 HMatrix *dist_hmatrix; /* just to not clutter the code */ 141 #endif 142 /* May use permutations */ 143 PetscSF sf; 144 PetscLayout h2opus_rmap, h2opus_cmap; 145 IS h2opus_indexmap; 146 thrust::host_vector<PetscScalar> *xx, *yy; 147 PetscInt xxs, yys; 148 PetscBool multsetup; 149 150 /* GPU */ 151 HMatrix_GPU *hmatrix_gpu; 152 #if defined(H2OPUS_USE_MPI) 153 DistributedHMatrix_GPU *dist_hmatrix_gpu; 154 #else 155 HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */ 156 #endif 157 #if defined(PETSC_H2OPUS_USE_GPU) 158 thrust::device_vector<PetscScalar> *xx_gpu, *yy_gpu; 159 PetscInt xxs_gpu, yys_gpu; 160 #endif 161 162 /* construction from matvecs */ 163 PetscMatrixSampler *sampler; 164 PetscBool nativemult; 165 166 /* Admissibility */ 167 PetscReal eta; 168 PetscInt leafsize; 169 170 /* for dof reordering */ 171 PetscPointCloud<PetscReal> *ptcloud; 172 173 /* kernel for generating matrix entries */ 174 PetscFunctionGenerator<PetscScalar> *kernel; 175 176 /* basis orthogonalized? */ 177 PetscBool orthogonal; 178 179 /* customization */ 180 PetscInt basisord; 181 PetscInt max_rank; 182 PetscInt bs; 183 PetscReal rtol; 184 PetscInt norm_max_samples; 185 PetscBool check_construction; 186 PetscBool hara_verbose; 187 PetscBool resize; 188 189 /* keeps track of MatScale values */ 190 PetscScalar s; 191 } Mat_H2OPUS; 192 193 static PetscErrorCode MatDestroy_H2OPUS(Mat A) { 194 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 195 196 PetscFunctionBegin; 197 #if defined(H2OPUS_USE_MPI) 198 h2opusDestroyDistributedHandle(a->handle); 199 #else 200 h2opusDestroyHandle(a->handle); 201 #endif 202 delete a->dist_hmatrix; 203 delete a->hmatrix; 204 PetscCall(PetscSFDestroy(&a->sf)); 205 PetscCall(PetscLayoutDestroy(&a->h2opus_rmap)); 206 PetscCall(PetscLayoutDestroy(&a->h2opus_cmap)); 207 PetscCall(ISDestroy(&a->h2opus_indexmap)); 208 delete a->xx; 209 delete a->yy; 210 delete a->hmatrix_gpu; 211 delete a->dist_hmatrix_gpu; 212 #if defined(PETSC_H2OPUS_USE_GPU) 213 delete a->xx_gpu; 214 delete a->yy_gpu; 215 #endif 216 delete a->sampler; 217 delete a->ptcloud; 218 delete a->kernel; 219 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", NULL)); 220 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", NULL)); 221 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", NULL)); 222 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", NULL)); 223 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 224 PetscCall(PetscFree(A->data)); 225 PetscFunctionReturn(0); 226 } 227 228 PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm) { 229 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 230 PetscBool ish2opus; 231 232 PetscFunctionBegin; 233 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 234 PetscValidLogicalCollectiveBool(A, nm, 2); 235 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 236 if (ish2opus) { 237 if (a->h2opus_rmap) { /* need to swap layouts for vector creation */ 238 if ((!a->nativemult && nm) || (a->nativemult && !nm)) { 239 PetscLayout t; 240 t = A->rmap; 241 A->rmap = a->h2opus_rmap; 242 a->h2opus_rmap = t; 243 t = A->cmap; 244 A->cmap = a->h2opus_cmap; 245 a->h2opus_cmap = t; 246 } 247 } 248 a->nativemult = nm; 249 } 250 PetscFunctionReturn(0); 251 } 252 253 PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm) { 254 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 255 PetscBool ish2opus; 256 257 PetscFunctionBegin; 258 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 259 PetscValidPointer(nm, 2); 260 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 261 PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name); 262 *nm = a->nativemult; 263 PetscFunctionReturn(0); 264 } 265 266 PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal *n) { 267 PetscBool ish2opus; 268 PetscInt nmax = PETSC_DECIDE; 269 Mat_H2OPUS *a = NULL; 270 PetscBool mult = PETSC_FALSE; 271 272 PetscFunctionBegin; 273 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 274 if (ish2opus) { /* set userdefine number of samples and fastpath for mult (norms are order independent) */ 275 a = (Mat_H2OPUS *)A->data; 276 277 nmax = a->norm_max_samples; 278 mult = a->nativemult; 279 PetscCall(MatH2OpusSetNativeMult(A, PETSC_TRUE)); 280 } else { 281 PetscCall(PetscOptionsGetInt(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_approximate_norm_samples", &nmax, NULL)); 282 } 283 PetscCall(MatApproximateNorm_Private(A, normtype, nmax, n)); 284 if (a) PetscCall(MatH2OpusSetNativeMult(A, mult)); 285 PetscFunctionReturn(0); 286 } 287 288 static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN) { 289 Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data; 290 PetscInt n; 291 PetscBool boundtocpu = PETSC_TRUE; 292 293 PetscFunctionBegin; 294 #if defined(PETSC_H2OPUS_USE_GPU) 295 boundtocpu = A->boundtocpu; 296 #endif 297 PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL)); 298 if (boundtocpu) { 299 if (h2opus->xxs < xN) { 300 h2opus->xx->resize(n * xN); 301 h2opus->xxs = xN; 302 } 303 if (h2opus->yys < yN) { 304 h2opus->yy->resize(n * yN); 305 h2opus->yys = yN; 306 } 307 } 308 #if defined(PETSC_H2OPUS_USE_GPU) 309 if (!boundtocpu) { 310 if (h2opus->xxs_gpu < xN) { 311 h2opus->xx_gpu->resize(n * xN); 312 h2opus->xxs_gpu = xN; 313 } 314 if (h2opus->yys_gpu < yN) { 315 h2opus->yy_gpu->resize(n * yN); 316 h2opus->yys_gpu = yN; 317 } 318 } 319 #endif 320 PetscFunctionReturn(0); 321 } 322 323 static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C) { 324 Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data; 325 #if defined(H2OPUS_USE_MPI) 326 h2opusHandle_t handle = h2opus->handle->handle; 327 #else 328 h2opusHandle_t handle = h2opus->handle; 329 #endif 330 PetscBool boundtocpu = PETSC_TRUE; 331 PetscScalar *xx, *yy, *uxx, *uyy; 332 PetscInt blda, clda; 333 PetscMPIInt size; 334 PetscSF bsf, csf; 335 PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult); 336 337 PetscFunctionBegin; 338 HLibProfile::clear(); 339 #if defined(PETSC_H2OPUS_USE_GPU) 340 boundtocpu = A->boundtocpu; 341 #endif 342 PetscCall(MatDenseGetLDA(B, &blda)); 343 PetscCall(MatDenseGetLDA(C, &clda)); 344 if (usesf) { 345 PetscInt n; 346 347 PetscCall(MatDenseGetH2OpusVectorSF(B, h2opus->sf, &bsf)); 348 PetscCall(MatDenseGetH2OpusVectorSF(C, h2opus->sf, &csf)); 349 350 PetscCall(MatH2OpusResizeBuffers_Private(A, B->cmap->N, C->cmap->N)); 351 PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL)); 352 blda = n; 353 clda = n; 354 } 355 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 356 if (boundtocpu) { 357 PetscCall(MatDenseGetArrayRead(B, (const PetscScalar **)&xx)); 358 PetscCall(MatDenseGetArrayWrite(C, &yy)); 359 if (usesf) { 360 uxx = MatH2OpusGetThrustPointer(*h2opus->xx); 361 uyy = MatH2OpusGetThrustPointer(*h2opus->yy); 362 PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 363 PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 364 } else { 365 uxx = xx; 366 uyy = yy; 367 } 368 if (size > 1) { 369 PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix"); 370 PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel"); 371 #if defined(H2OPUS_USE_MPI) 372 distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle); 373 #endif 374 } else { 375 PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 376 hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle); 377 } 378 PetscCall(MatDenseRestoreArrayRead(B, (const PetscScalar **)&xx)); 379 if (usesf) { 380 PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 381 PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 382 } 383 PetscCall(MatDenseRestoreArrayWrite(C, &yy)); 384 #if defined(PETSC_H2OPUS_USE_GPU) 385 } else { 386 PetscBool ciscuda, biscuda; 387 388 /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */ 389 PetscCall(PetscObjectTypeCompareAny((PetscObject)B, &biscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, "")); 390 if (!biscuda) PetscCall(MatConvert(B, MATDENSECUDA, MAT_INPLACE_MATRIX, &B)); 391 PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &ciscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, "")); 392 if (!ciscuda) { 393 C->assembled = PETSC_TRUE; 394 PetscCall(MatConvert(C, MATDENSECUDA, MAT_INPLACE_MATRIX, &C)); 395 } 396 PetscCall(MatDenseCUDAGetArrayRead(B, (const PetscScalar **)&xx)); 397 PetscCall(MatDenseCUDAGetArrayWrite(C, &yy)); 398 if (usesf) { 399 uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu); 400 uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu); 401 PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 402 PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 403 } else { 404 uxx = xx; 405 uyy = yy; 406 } 407 PetscCall(PetscLogGpuTimeBegin()); 408 if (size > 1) { 409 PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix"); 410 PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel"); 411 #if defined(H2OPUS_USE_MPI) 412 distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle); 413 #endif 414 } else { 415 PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 416 hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle); 417 } 418 PetscCall(PetscLogGpuTimeEnd()); 419 PetscCall(MatDenseCUDARestoreArrayRead(B, (const PetscScalar **)&xx)); 420 if (usesf) { 421 PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 422 PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 423 } 424 PetscCall(MatDenseCUDARestoreArrayWrite(C, &yy)); 425 if (!biscuda) PetscCall(MatConvert(B, MATDENSE, MAT_INPLACE_MATRIX, &B)); 426 if (!ciscuda) PetscCall(MatConvert(C, MATDENSE, MAT_INPLACE_MATRIX, &C)); 427 #endif 428 } 429 { /* log flops */ 430 double gops, time, perf, dev; 431 HLibProfile::getHgemvPerf(gops, time, perf, dev); 432 #if defined(PETSC_H2OPUS_USE_GPU) 433 if (boundtocpu) { 434 PetscCall(PetscLogFlops(1e9 * gops)); 435 } else { 436 PetscCall(PetscLogGpuFlops(1e9 * gops)); 437 } 438 #else 439 PetscCall(PetscLogFlops(1e9 * gops)); 440 #endif 441 } 442 PetscFunctionReturn(0); 443 } 444 445 static PetscErrorCode MatProductNumeric_H2OPUS(Mat C) { 446 Mat_Product *product = C->product; 447 448 PetscFunctionBegin; 449 MatCheckProduct(C, 1); 450 switch (product->type) { 451 case MATPRODUCT_AB: PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_FALSE, product->B, C)); break; 452 case MATPRODUCT_AtB: PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_TRUE, product->B, C)); break; 453 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]); 454 } 455 PetscFunctionReturn(0); 456 } 457 458 static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C) { 459 Mat_Product *product = C->product; 460 PetscBool cisdense; 461 Mat A, B; 462 463 PetscFunctionBegin; 464 MatCheckProduct(C, 1); 465 A = product->A; 466 B = product->B; 467 switch (product->type) { 468 case MATPRODUCT_AB: 469 PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N)); 470 PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B)); 471 PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, "")); 472 if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name)); 473 PetscCall(MatSetUp(C)); 474 break; 475 case MATPRODUCT_AtB: 476 PetscCall(MatSetSizes(C, A->cmap->n, B->cmap->n, A->cmap->N, B->cmap->N)); 477 PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B)); 478 PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, "")); 479 if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name)); 480 PetscCall(MatSetUp(C)); 481 break; 482 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]); 483 } 484 C->ops->productsymbolic = NULL; 485 C->ops->productnumeric = MatProductNumeric_H2OPUS; 486 PetscFunctionReturn(0); 487 } 488 489 static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C) { 490 PetscFunctionBegin; 491 MatCheckProduct(C, 1); 492 if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) C->ops->productsymbolic = MatProductSymbolic_H2OPUS; 493 PetscFunctionReturn(0); 494 } 495 496 static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans) { 497 Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data; 498 #if defined(H2OPUS_USE_MPI) 499 h2opusHandle_t handle = h2opus->handle->handle; 500 #else 501 h2opusHandle_t handle = h2opus->handle; 502 #endif 503 PetscBool boundtocpu = PETSC_TRUE; 504 PetscInt n; 505 PetscScalar *xx, *yy, *uxx, *uyy; 506 PetscMPIInt size; 507 PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult); 508 509 PetscFunctionBegin; 510 HLibProfile::clear(); 511 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 512 #if defined(PETSC_H2OPUS_USE_GPU) 513 boundtocpu = A->boundtocpu; 514 #endif 515 if (usesf) { 516 PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL)); 517 } else n = A->rmap->n; 518 if (boundtocpu) { 519 PetscCall(VecGetArrayRead(x, (const PetscScalar **)&xx)); 520 if (sy == 0.0) { 521 PetscCall(VecGetArrayWrite(y, &yy)); 522 } else { 523 PetscCall(VecGetArray(y, &yy)); 524 } 525 if (usesf) { 526 uxx = MatH2OpusGetThrustPointer(*h2opus->xx); 527 uyy = MatH2OpusGetThrustPointer(*h2opus->yy); 528 529 PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 530 PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 531 if (sy != 0.0) { 532 PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE)); 533 PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE)); 534 } 535 } else { 536 uxx = xx; 537 uyy = yy; 538 } 539 if (size > 1) { 540 PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix"); 541 PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel"); 542 #if defined(H2OPUS_USE_MPI) 543 distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle); 544 #endif 545 } else { 546 PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 547 hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle); 548 } 549 PetscCall(VecRestoreArrayRead(x, (const PetscScalar **)&xx)); 550 if (usesf) { 551 PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 552 PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 553 } 554 if (sy == 0.0) { 555 PetscCall(VecRestoreArrayWrite(y, &yy)); 556 } else { 557 PetscCall(VecRestoreArray(y, &yy)); 558 } 559 #if defined(PETSC_H2OPUS_USE_GPU) 560 } else { 561 PetscCall(VecCUDAGetArrayRead(x, (const PetscScalar **)&xx)); 562 if (sy == 0.0) { 563 PetscCall(VecCUDAGetArrayWrite(y, &yy)); 564 } else { 565 PetscCall(VecCUDAGetArray(y, &yy)); 566 } 567 if (usesf) { 568 uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu); 569 uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu); 570 571 PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 572 PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 573 if (sy != 0.0) { 574 PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE)); 575 PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE)); 576 } 577 } else { 578 uxx = xx; 579 uyy = yy; 580 } 581 PetscCall(PetscLogGpuTimeBegin()); 582 if (size > 1) { 583 PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix"); 584 PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel"); 585 #if defined(H2OPUS_USE_MPI) 586 distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle); 587 #endif 588 } else { 589 PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 590 hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle); 591 } 592 PetscCall(PetscLogGpuTimeEnd()); 593 PetscCall(VecCUDARestoreArrayRead(x, (const PetscScalar **)&xx)); 594 if (usesf) { 595 PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 596 PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 597 } 598 if (sy == 0.0) { 599 PetscCall(VecCUDARestoreArrayWrite(y, &yy)); 600 } else { 601 PetscCall(VecCUDARestoreArray(y, &yy)); 602 } 603 #endif 604 } 605 { /* log flops */ 606 double gops, time, perf, dev; 607 HLibProfile::getHgemvPerf(gops, time, perf, dev); 608 #if defined(PETSC_H2OPUS_USE_GPU) 609 if (boundtocpu) { 610 PetscCall(PetscLogFlops(1e9 * gops)); 611 } else { 612 PetscCall(PetscLogGpuFlops(1e9 * gops)); 613 } 614 #else 615 PetscCall(PetscLogFlops(1e9 * gops)); 616 #endif 617 } 618 PetscFunctionReturn(0); 619 } 620 621 static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y) { 622 PetscBool xiscuda, yiscuda; 623 624 PetscFunctionBegin; 625 PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, "")); 626 PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, "")); 627 PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda)); 628 PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_TRUE)); 629 PetscFunctionReturn(0); 630 } 631 632 static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y) { 633 PetscBool xiscuda, yiscuda; 634 635 PetscFunctionBegin; 636 PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, "")); 637 PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, "")); 638 PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda)); 639 PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_FALSE)); 640 PetscFunctionReturn(0); 641 } 642 643 static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z) { 644 PetscBool xiscuda, ziscuda; 645 646 PetscFunctionBegin; 647 PetscCall(VecCopy(y, z)); 648 PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, "")); 649 PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, "")); 650 PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda)); 651 PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_TRUE)); 652 PetscFunctionReturn(0); 653 } 654 655 static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z) { 656 PetscBool xiscuda, ziscuda; 657 658 PetscFunctionBegin; 659 PetscCall(VecCopy(y, z)); 660 PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, "")); 661 PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, "")); 662 PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda)); 663 PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_FALSE)); 664 PetscFunctionReturn(0); 665 } 666 667 static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s) { 668 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 669 670 PetscFunctionBegin; 671 a->s *= s; 672 PetscFunctionReturn(0); 673 } 674 675 static PetscErrorCode MatSetFromOptions_H2OPUS(Mat A, PetscOptionItems *PetscOptionsObject) { 676 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 677 678 PetscFunctionBegin; 679 PetscOptionsHeadBegin(PetscOptionsObject, "H2OPUS options"); 680 PetscCall(PetscOptionsInt("-mat_h2opus_leafsize", "Leaf size of cluster tree", NULL, a->leafsize, &a->leafsize, NULL)); 681 PetscCall(PetscOptionsReal("-mat_h2opus_eta", "Admissibility condition tolerance", NULL, a->eta, &a->eta, NULL)); 682 PetscCall(PetscOptionsInt("-mat_h2opus_order", "Basis order for off-diagonal sampling when constructed from kernel", NULL, a->basisord, &a->basisord, NULL)); 683 PetscCall(PetscOptionsInt("-mat_h2opus_maxrank", "Maximum rank when constructed from matvecs", NULL, a->max_rank, &a->max_rank, NULL)); 684 PetscCall(PetscOptionsInt("-mat_h2opus_samples", "Maximum number of samples to be taken concurrently when constructing from matvecs", NULL, a->bs, &a->bs, NULL)); 685 PetscCall(PetscOptionsInt("-mat_h2opus_normsamples", "Maximum bumber of samples to be when estimating norms", NULL, a->norm_max_samples, &a->norm_max_samples, NULL)); 686 PetscCall(PetscOptionsReal("-mat_h2opus_rtol", "Relative tolerance for construction from sampling", NULL, a->rtol, &a->rtol, NULL)); 687 PetscCall(PetscOptionsBool("-mat_h2opus_check", "Check error when constructing from sampling during MatAssemblyEnd()", NULL, a->check_construction, &a->check_construction, NULL)); 688 PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose", "Verbose output from hara construction", NULL, a->hara_verbose, &a->hara_verbose, NULL)); 689 PetscCall(PetscOptionsBool("-mat_h2opus_resize", "Resize after compression", NULL, a->resize, &a->resize, NULL)); 690 PetscOptionsHeadEnd(); 691 PetscFunctionReturn(0); 692 } 693 694 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat, PetscInt, const PetscReal[], PetscBool, MatH2OpusKernel, void *); 695 696 static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A) { 697 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 698 Vec c; 699 PetscInt spacedim; 700 const PetscScalar *coords; 701 702 PetscFunctionBegin; 703 if (a->ptcloud) PetscFunctionReturn(0); 704 PetscCall(PetscObjectQuery((PetscObject)A, "__math2opus_coords", (PetscObject *)&c)); 705 if (!c && a->sampler) { 706 Mat S = a->sampler->GetSamplingMat(); 707 708 PetscCall(PetscObjectQuery((PetscObject)S, "__math2opus_coords", (PetscObject *)&c)); 709 } 710 if (!c) { 711 PetscCall(MatH2OpusSetCoords_H2OPUS(A, -1, NULL, PETSC_FALSE, NULL, NULL)); 712 } else { 713 PetscCall(VecGetArrayRead(c, &coords)); 714 PetscCall(VecGetBlockSize(c, &spacedim)); 715 PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, PETSC_FALSE, NULL, NULL)); 716 PetscCall(VecRestoreArrayRead(c, &coords)); 717 } 718 PetscFunctionReturn(0); 719 } 720 721 static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A) { 722 MPI_Comm comm; 723 PetscMPIInt size; 724 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 725 PetscInt n = 0, *idx = NULL; 726 int *iidx = NULL; 727 PetscCopyMode own; 728 PetscBool rid; 729 730 PetscFunctionBegin; 731 if (a->multsetup) PetscFunctionReturn(0); 732 if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */ 733 PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL)); 734 #if defined(PETSC_H2OPUS_USE_GPU) 735 a->xx_gpu = new thrust::device_vector<PetscScalar>(n); 736 a->yy_gpu = new thrust::device_vector<PetscScalar>(n); 737 a->xxs_gpu = 1; 738 a->yys_gpu = 1; 739 #endif 740 a->xx = new thrust::host_vector<PetscScalar>(n); 741 a->yy = new thrust::host_vector<PetscScalar>(n); 742 a->xxs = 1; 743 a->yys = 1; 744 } else { 745 IS is; 746 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 747 PetscCallMPI(MPI_Comm_size(comm, &size)); 748 if (!a->h2opus_indexmap) { 749 if (size > 1) { 750 PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix"); 751 #if defined(H2OPUS_USE_MPI) 752 iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map); 753 n = a->dist_hmatrix->basis_tree.basis_branch.index_map.size(); 754 #endif 755 } else { 756 iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map); 757 n = a->hmatrix->u_basis_tree.index_map.size(); 758 } 759 760 if (PetscDefined(USE_64BIT_INDICES)) { 761 PetscInt i; 762 763 own = PETSC_OWN_POINTER; 764 PetscCall(PetscMalloc1(n, &idx)); 765 for (i = 0; i < n; i++) idx[i] = iidx[i]; 766 } else { 767 own = PETSC_COPY_VALUES; 768 idx = (PetscInt *)iidx; 769 } 770 PetscCall(ISCreateGeneral(comm, n, idx, own, &is)); 771 PetscCall(ISSetPermutation(is)); 772 PetscCall(ISViewFromOptions(is, (PetscObject)A, "-mat_h2opus_indexmap_view")); 773 a->h2opus_indexmap = is; 774 } 775 PetscCall(ISGetLocalSize(a->h2opus_indexmap, &n)); 776 PetscCall(ISGetIndices(a->h2opus_indexmap, (const PetscInt **)&idx)); 777 rid = (PetscBool)(n == A->rmap->n); 778 PetscCall(MPIU_Allreduce(MPI_IN_PLACE, &rid, 1, MPIU_BOOL, MPI_LAND, comm)); 779 if (rid) PetscCall(ISIdentity(a->h2opus_indexmap, &rid)); 780 if (!rid) { 781 if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */ 782 PetscCall(PetscLayoutCreate(comm, &a->h2opus_rmap)); 783 PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap, n)); 784 PetscCall(PetscLayoutSetUp(a->h2opus_rmap)); 785 PetscCall(PetscLayoutReference(a->h2opus_rmap, &a->h2opus_cmap)); 786 } 787 PetscCall(PetscSFCreate(comm, &a->sf)); 788 PetscCall(PetscSFSetGraphLayout(a->sf, A->rmap, n, NULL, PETSC_OWN_POINTER, idx)); 789 PetscCall(PetscSFSetUp(a->sf)); 790 PetscCall(PetscSFViewFromOptions(a->sf, (PetscObject)A, "-mat_h2opus_sf_view")); 791 #if defined(PETSC_H2OPUS_USE_GPU) 792 a->xx_gpu = new thrust::device_vector<PetscScalar>(n); 793 a->yy_gpu = new thrust::device_vector<PetscScalar>(n); 794 a->xxs_gpu = 1; 795 a->yys_gpu = 1; 796 #endif 797 a->xx = new thrust::host_vector<PetscScalar>(n); 798 a->yy = new thrust::host_vector<PetscScalar>(n); 799 a->xxs = 1; 800 a->yys = 1; 801 } 802 PetscCall(ISRestoreIndices(a->h2opus_indexmap, (const PetscInt **)&idx)); 803 } 804 a->multsetup = PETSC_TRUE; 805 PetscFunctionReturn(0); 806 } 807 808 static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype) { 809 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 810 #if defined(H2OPUS_USE_MPI) 811 h2opusHandle_t handle = a->handle->handle; 812 #else 813 h2opusHandle_t handle = a->handle; 814 #endif 815 PetscBool kernel = PETSC_FALSE; 816 PetscBool boundtocpu = PETSC_TRUE; 817 PetscBool samplingdone = PETSC_FALSE; 818 MPI_Comm comm; 819 PetscMPIInt size; 820 821 PetscFunctionBegin; 822 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 823 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported"); 824 PetscCheck(A->rmap->N == A->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported"); 825 826 /* XXX */ 827 a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N)); 828 829 PetscCallMPI(MPI_Comm_size(comm, &size)); 830 /* TODO REUSABILITY of geometric construction */ 831 delete a->hmatrix; 832 delete a->dist_hmatrix; 833 #if defined(PETSC_H2OPUS_USE_GPU) 834 delete a->hmatrix_gpu; 835 delete a->dist_hmatrix_gpu; 836 #endif 837 a->orthogonal = PETSC_FALSE; 838 839 /* TODO: other? */ 840 H2OpusBoxCenterAdmissibility adm(a->eta); 841 842 PetscCall(PetscLogEventBegin(MAT_H2Opus_Build, A, 0, 0, 0)); 843 if (size > 1) { 844 #if defined(H2OPUS_USE_MPI) 845 a->dist_hmatrix = new DistributedHMatrix(A->rmap->n /* ,A->symmetric */); 846 #else 847 a->dist_hmatrix = NULL; 848 #endif 849 } else a->hmatrix = new HMatrix(A->rmap->n, A->symmetric == PETSC_BOOL3_TRUE); 850 PetscCall(MatH2OpusInferCoordinates_Private(A)); 851 PetscCheck(a->ptcloud, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing pointcloud"); 852 if (a->kernel) { 853 BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel); 854 if (size > 1) { 855 PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix"); 856 #if defined(H2OPUS_USE_MPI) 857 buildDistributedHMatrix(*a->dist_hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord, a->handle); 858 #endif 859 } else { 860 buildHMatrix(*a->hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord); 861 } 862 kernel = PETSC_TRUE; 863 } else { 864 PetscCheck(size <= 1, comm, PETSC_ERR_SUP, "Construction from sampling not supported in parallel"); 865 buildHMatrixStructure(*a->hmatrix, a->ptcloud, a->leafsize, adm); 866 } 867 PetscCall(MatSetUpMultiply_H2OPUS(A)); 868 869 #if defined(PETSC_H2OPUS_USE_GPU) 870 boundtocpu = A->boundtocpu; 871 if (!boundtocpu) { 872 if (size > 1) { 873 PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix"); 874 #if defined(H2OPUS_USE_MPI) 875 a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix); 876 #endif 877 } else { 878 a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix); 879 } 880 } 881 #endif 882 if (size == 1) { 883 if (!kernel && a->sampler && a->sampler->GetSamplingMat()) { 884 PetscReal Anorm; 885 bool verbose; 886 887 PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_hara_verbose", &a->hara_verbose, NULL)); 888 verbose = a->hara_verbose; 889 PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(), NORM_2, a->norm_max_samples, &Anorm)); 890 if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF, "Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n", a->max_rank, a->rtol * Anorm, a->rtol, Anorm, boundtocpu ? "CPU" : "GPU", a->bs)); 891 if (a->sf && !a->nativemult) a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(), a->hmatrix->u_basis_tree.index_map.data()); 892 a->sampler->SetStream(handle->getMainStream()); 893 if (boundtocpu) { 894 a->sampler->SetGPUSampling(false); 895 hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose); 896 #if defined(PETSC_H2OPUS_USE_GPU) 897 } else { 898 a->sampler->SetGPUSampling(true); 899 hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose); 900 #endif 901 } 902 samplingdone = PETSC_TRUE; 903 } 904 } 905 #if defined(PETSC_H2OPUS_USE_GPU) 906 if (!boundtocpu) { 907 delete a->hmatrix; 908 delete a->dist_hmatrix; 909 a->hmatrix = NULL; 910 a->dist_hmatrix = NULL; 911 } 912 A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU; 913 #endif 914 PetscCall(PetscLogEventEnd(MAT_H2Opus_Build, A, 0, 0, 0)); 915 916 if (!a->s) a->s = 1.0; 917 A->assembled = PETSC_TRUE; 918 919 if (samplingdone) { 920 PetscBool check = a->check_construction; 921 PetscBool checke = PETSC_FALSE; 922 923 PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check", &check, NULL)); 924 PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check_explicit", &checke, NULL)); 925 if (check) { 926 Mat E, Ae; 927 PetscReal n1, ni, n2; 928 PetscReal n1A, niA, n2A; 929 void (*normfunc)(void); 930 931 Ae = a->sampler->GetSamplingMat(); 932 PetscCall(MatConvert(A, MATSHELL, MAT_INITIAL_MATRIX, &E)); 933 PetscCall(MatShellSetOperation(E, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS)); 934 PetscCall(MatAXPY(E, -1.0, Ae, DIFFERENT_NONZERO_PATTERN)); 935 PetscCall(MatNorm(E, NORM_1, &n1)); 936 PetscCall(MatNorm(E, NORM_INFINITY, &ni)); 937 PetscCall(MatNorm(E, NORM_2, &n2)); 938 if (checke) { 939 Mat eA, eE, eAe; 940 941 PetscCall(MatComputeOperator(A, MATAIJ, &eA)); 942 PetscCall(MatComputeOperator(E, MATAIJ, &eE)); 943 PetscCall(MatComputeOperator(Ae, MATAIJ, &eAe)); 944 PetscCall(MatChop(eA, PETSC_SMALL)); 945 PetscCall(MatChop(eE, PETSC_SMALL)); 946 PetscCall(MatChop(eAe, PETSC_SMALL)); 947 PetscCall(PetscObjectSetName((PetscObject)eA, "H2Mat")); 948 PetscCall(MatView(eA, NULL)); 949 PetscCall(PetscObjectSetName((PetscObject)eAe, "S")); 950 PetscCall(MatView(eAe, NULL)); 951 PetscCall(PetscObjectSetName((PetscObject)eE, "H2Mat - S")); 952 PetscCall(MatView(eE, NULL)); 953 PetscCall(MatDestroy(&eA)); 954 PetscCall(MatDestroy(&eE)); 955 PetscCall(MatDestroy(&eAe)); 956 } 957 958 PetscCall(MatGetOperation(Ae, MATOP_NORM, &normfunc)); 959 PetscCall(MatSetOperation(Ae, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS)); 960 PetscCall(MatNorm(Ae, NORM_1, &n1A)); 961 PetscCall(MatNorm(Ae, NORM_INFINITY, &niA)); 962 PetscCall(MatNorm(Ae, NORM_2, &n2A)); 963 n1A = PetscMax(n1A, PETSC_SMALL); 964 n2A = PetscMax(n2A, PETSC_SMALL); 965 niA = PetscMax(niA, PETSC_SMALL); 966 PetscCall(MatSetOperation(Ae, MATOP_NORM, normfunc)); 967 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n", (double)n1, (double)ni, (double)n2, (double)(n1 / n1A), (double)(ni / niA), (double)(n2 / n2A))); 968 PetscCall(MatDestroy(&E)); 969 } 970 a->sampler->SetSamplingMat(NULL); 971 } 972 PetscFunctionReturn(0); 973 } 974 975 static PetscErrorCode MatZeroEntries_H2OPUS(Mat A) { 976 PetscMPIInt size; 977 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 978 979 PetscFunctionBegin; 980 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 981 PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet supported"); 982 a->hmatrix->clearData(); 983 #if defined(PETSC_H2OPUS_USE_GPU) 984 if (a->hmatrix_gpu) a->hmatrix_gpu->clearData(); 985 #endif 986 PetscFunctionReturn(0); 987 } 988 989 static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA) { 990 Mat A; 991 Mat_H2OPUS *a, *b = (Mat_H2OPUS *)B->data; 992 #if defined(PETSC_H2OPUS_USE_GPU) 993 PetscBool iscpu = PETSC_FALSE; 994 #else 995 PetscBool iscpu = PETSC_TRUE; 996 #endif 997 MPI_Comm comm; 998 999 PetscFunctionBegin; 1000 PetscCall(PetscObjectGetComm((PetscObject)B, &comm)); 1001 PetscCall(MatCreate(comm, &A)); 1002 PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N)); 1003 PetscCall(MatSetType(A, MATH2OPUS)); 1004 PetscCall(MatPropagateSymmetryOptions(B, A)); 1005 a = (Mat_H2OPUS *)A->data; 1006 1007 a->eta = b->eta; 1008 a->leafsize = b->leafsize; 1009 a->basisord = b->basisord; 1010 a->max_rank = b->max_rank; 1011 a->bs = b->bs; 1012 a->rtol = b->rtol; 1013 a->norm_max_samples = b->norm_max_samples; 1014 if (op == MAT_COPY_VALUES) a->s = b->s; 1015 1016 a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud); 1017 if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel); 1018 1019 #if defined(H2OPUS_USE_MPI) 1020 if (b->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix); 1021 #if defined(PETSC_H2OPUS_USE_GPU) 1022 if (b->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu); 1023 #endif 1024 #endif 1025 if (b->hmatrix) { 1026 a->hmatrix = new HMatrix(*b->hmatrix); 1027 if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData(); 1028 } 1029 #if defined(PETSC_H2OPUS_USE_GPU) 1030 if (b->hmatrix_gpu) { 1031 a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu); 1032 if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData(); 1033 } 1034 #endif 1035 if (b->sf) { 1036 PetscCall(PetscObjectReference((PetscObject)b->sf)); 1037 a->sf = b->sf; 1038 } 1039 if (b->h2opus_indexmap) { 1040 PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap)); 1041 a->h2opus_indexmap = b->h2opus_indexmap; 1042 } 1043 1044 PetscCall(MatSetUp(A)); 1045 PetscCall(MatSetUpMultiply_H2OPUS(A)); 1046 if (op == MAT_COPY_VALUES) { 1047 A->assembled = PETSC_TRUE; 1048 a->orthogonal = b->orthogonal; 1049 #if defined(PETSC_H2OPUS_USE_GPU) 1050 A->offloadmask = B->offloadmask; 1051 #endif 1052 } 1053 #if defined(PETSC_H2OPUS_USE_GPU) 1054 iscpu = B->boundtocpu; 1055 #endif 1056 PetscCall(MatBindToCPU(A, iscpu)); 1057 1058 *nA = A; 1059 PetscFunctionReturn(0); 1060 } 1061 1062 static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view) { 1063 Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data; 1064 PetscBool isascii, vieweps; 1065 PetscMPIInt size; 1066 PetscViewerFormat format; 1067 1068 PetscFunctionBegin; 1069 PetscCall(PetscObjectTypeCompare((PetscObject)view, PETSCVIEWERASCII, &isascii)); 1070 PetscCall(PetscViewerGetFormat(view, &format)); 1071 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1072 if (isascii) { 1073 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1074 if (size == 1) { 1075 FILE *fp; 1076 PetscCall(PetscViewerASCIIGetPointer(view, &fp)); 1077 dumpHMatrix(*h2opus->hmatrix, 6, fp); 1078 } 1079 } else { 1080 PetscCall(PetscViewerASCIIPrintf(view, " H-Matrix constructed from %s\n", h2opus->kernel ? "Kernel" : "Mat")); 1081 PetscCall(PetscViewerASCIIPrintf(view, " PointCloud dim %" PetscInt_FMT "\n", h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0)); 1082 PetscCall(PetscViewerASCIIPrintf(view, " Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n", h2opus->leafsize, (double)h2opus->eta)); 1083 if (!h2opus->kernel) { 1084 PetscCall(PetscViewerASCIIPrintf(view, " Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n", h2opus->max_rank, h2opus->bs, (double)h2opus->rtol)); 1085 } else { 1086 PetscCall(PetscViewerASCIIPrintf(view, " Offdiagonal blocks approximation order %" PetscInt_FMT "\n", h2opus->basisord)); 1087 } 1088 PetscCall(PetscViewerASCIIPrintf(view, " Number of samples for norms %" PetscInt_FMT "\n", h2opus->norm_max_samples)); 1089 if (size == 1) { 1090 double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0; 1091 double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0; 1092 #if defined(PETSC_HAVE_CUDA) 1093 double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0; 1094 double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0; 1095 #endif 1096 PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu)); 1097 #if defined(PETSC_HAVE_CUDA) 1098 PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu)); 1099 #endif 1100 } else { 1101 #if defined(PETSC_HAVE_CUDA) 1102 double matrix_mem[4] = {0., 0., 0., 0.}; 1103 PetscMPIInt rsize = 4; 1104 #else 1105 double matrix_mem[2] = {0., 0.}; 1106 PetscMPIInt rsize = 2; 1107 #endif 1108 #if defined(H2OPUS_USE_MPI) 1109 matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0; 1110 matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0; 1111 #if defined(PETSC_HAVE_CUDA) 1112 matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0; 1113 matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0; 1114 #endif 1115 #endif 1116 PetscCall(MPIU_Allreduce(MPI_IN_PLACE, matrix_mem, rsize, MPI_DOUBLE_PRECISION, MPI_SUM, PetscObjectComm((PetscObject)A))); 1117 PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1])); 1118 #if defined(PETSC_HAVE_CUDA) 1119 PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3])); 1120 #endif 1121 } 1122 } 1123 } 1124 vieweps = PETSC_FALSE; 1125 PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps", &vieweps, NULL)); 1126 if (vieweps) { 1127 char filename[256]; 1128 const char *name; 1129 1130 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1131 PetscCall(PetscSNPrintf(filename, sizeof(filename), "%s_structure.eps", name)); 1132 PetscCall(PetscOptionsGetString(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps_filename", filename, sizeof(filename), NULL)); 1133 outputEps(*h2opus->hmatrix, filename); 1134 } 1135 PetscFunctionReturn(0); 1136 } 1137 1138 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx) { 1139 Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data; 1140 PetscReal *gcoords; 1141 PetscInt N; 1142 MPI_Comm comm; 1143 PetscMPIInt size; 1144 PetscBool cong; 1145 1146 PetscFunctionBegin; 1147 PetscCall(PetscLayoutSetUp(A->rmap)); 1148 PetscCall(PetscLayoutSetUp(A->cmap)); 1149 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 1150 PetscCall(MatHasCongruentLayouts(A, &cong)); 1151 PetscCheck(cong, comm, PETSC_ERR_SUP, "Only for square matrices with congruent layouts"); 1152 N = A->rmap->N; 1153 PetscCallMPI(MPI_Comm_size(comm, &size)); 1154 if (spacedim > 0 && size > 1 && cdist) { 1155 PetscSF sf; 1156 MPI_Datatype dtype; 1157 1158 PetscCallMPI(MPI_Type_contiguous(spacedim, MPIU_REAL, &dtype)); 1159 PetscCallMPI(MPI_Type_commit(&dtype)); 1160 1161 PetscCall(PetscSFCreate(comm, &sf)); 1162 PetscCall(PetscSFSetGraphWithPattern(sf, A->rmap, PETSCSF_PATTERN_ALLGATHER)); 1163 PetscCall(PetscMalloc1(spacedim * N, &gcoords)); 1164 PetscCall(PetscSFBcastBegin(sf, dtype, coords, gcoords, MPI_REPLACE)); 1165 PetscCall(PetscSFBcastEnd(sf, dtype, coords, gcoords, MPI_REPLACE)); 1166 PetscCall(PetscSFDestroy(&sf)); 1167 PetscCallMPI(MPI_Type_free(&dtype)); 1168 } else gcoords = (PetscReal *)coords; 1169 1170 delete h2opus->ptcloud; 1171 delete h2opus->kernel; 1172 h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim, N, gcoords); 1173 if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel, spacedim, kernelctx); 1174 if (gcoords != coords) PetscCall(PetscFree(gcoords)); 1175 A->preallocated = PETSC_TRUE; 1176 PetscFunctionReturn(0); 1177 } 1178 1179 #if defined(PETSC_H2OPUS_USE_GPU) 1180 static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg) { 1181 PetscMPIInt size; 1182 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1183 1184 PetscFunctionBegin; 1185 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1186 if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) { 1187 if (size > 1) { 1188 PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1189 #if defined(H2OPUS_USE_MPI) 1190 if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu); 1191 else *a->dist_hmatrix = *a->dist_hmatrix_gpu; 1192 #endif 1193 } else { 1194 PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1195 if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu); 1196 else *a->hmatrix = *a->hmatrix_gpu; 1197 } 1198 delete a->hmatrix_gpu; 1199 delete a->dist_hmatrix_gpu; 1200 a->hmatrix_gpu = NULL; 1201 a->dist_hmatrix_gpu = NULL; 1202 A->offloadmask = PETSC_OFFLOAD_CPU; 1203 } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) { 1204 if (size > 1) { 1205 PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1206 #if defined(H2OPUS_USE_MPI) 1207 if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix); 1208 else *a->dist_hmatrix_gpu = *a->dist_hmatrix; 1209 #endif 1210 } else { 1211 PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1212 if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix); 1213 else *a->hmatrix_gpu = *a->hmatrix; 1214 } 1215 delete a->hmatrix; 1216 delete a->dist_hmatrix; 1217 a->hmatrix = NULL; 1218 a->dist_hmatrix = NULL; 1219 A->offloadmask = PETSC_OFFLOAD_GPU; 1220 } 1221 PetscCall(PetscFree(A->defaultvectype)); 1222 if (!flg) { 1223 PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype)); 1224 } else { 1225 PetscCall(PetscStrallocpy(VECSTANDARD, &A->defaultvectype)); 1226 } 1227 A->boundtocpu = flg; 1228 PetscFunctionReturn(0); 1229 } 1230 #endif 1231 1232 /*MC 1233 MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package. 1234 1235 Options Database Keys: 1236 . -mat_type h2opus - matrix type to "h2opus" during a call to MatSetFromOptions() 1237 1238 Notes: 1239 H2Opus implements hierarchical matrices in the H^2 flavour. It supports CPU or NVIDIA GPUs. 1240 1241 For CPU only builds, use ./configure --download-h2opus --download-thrust to install PETSc to use H2Opus. 1242 In order to run on NVIDIA GPUs, use ./configure --download-h2opus --download-magma --download-kblas. 1243 1244 Level: beginner 1245 1246 Reference: 1247 . * - "H2Opus: A distributed-memory multi-GPU software package for non-local operators", https://arxiv.org/abs/2109.05451 1248 1249 .seealso: `MATH2OPUS`, `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()` 1250 M*/ 1251 PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A) { 1252 Mat_H2OPUS *a; 1253 PetscMPIInt size; 1254 1255 PetscFunctionBegin; 1256 #if defined(PETSC_H2OPUS_USE_GPU) 1257 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 1258 #endif 1259 PetscCall(PetscNewLog(A, &a)); 1260 A->data = (void *)a; 1261 1262 a->eta = 0.9; 1263 a->leafsize = 32; 1264 a->basisord = 4; 1265 a->max_rank = 64; 1266 a->bs = 32; 1267 a->rtol = 1.e-4; 1268 a->s = 1.0; 1269 a->norm_max_samples = 10; 1270 a->resize = PETSC_TRUE; /* reallocate after compression */ 1271 #if defined(H2OPUS_USE_MPI) 1272 h2opusCreateDistributedHandleComm(&a->handle, PetscObjectComm((PetscObject)A)); 1273 #else 1274 h2opusCreateHandle(&a->handle); 1275 #endif 1276 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1277 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATH2OPUS)); 1278 PetscCall(PetscMemzero(A->ops, sizeof(struct _MatOps))); 1279 1280 A->ops->destroy = MatDestroy_H2OPUS; 1281 A->ops->view = MatView_H2OPUS; 1282 A->ops->assemblyend = MatAssemblyEnd_H2OPUS; 1283 A->ops->mult = MatMult_H2OPUS; 1284 A->ops->multtranspose = MatMultTranspose_H2OPUS; 1285 A->ops->multadd = MatMultAdd_H2OPUS; 1286 A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS; 1287 A->ops->scale = MatScale_H2OPUS; 1288 A->ops->duplicate = MatDuplicate_H2OPUS; 1289 A->ops->setfromoptions = MatSetFromOptions_H2OPUS; 1290 A->ops->norm = MatNorm_H2OPUS; 1291 A->ops->zeroentries = MatZeroEntries_H2OPUS; 1292 #if defined(PETSC_H2OPUS_USE_GPU) 1293 A->ops->bindtocpu = MatBindToCPU_H2OPUS; 1294 #endif 1295 1296 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", MatProductSetFromOptions_H2OPUS)); 1297 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", MatProductSetFromOptions_H2OPUS)); 1298 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", MatProductSetFromOptions_H2OPUS)); 1299 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", MatProductSetFromOptions_H2OPUS)); 1300 #if defined(PETSC_H2OPUS_USE_GPU) 1301 PetscCall(PetscFree(A->defaultvectype)); 1302 PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype)); 1303 #endif 1304 PetscFunctionReturn(0); 1305 } 1306 1307 /*@C 1308 MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix. 1309 1310 Input Parameter: 1311 . A - the matrix 1312 1313 Level: intermediate 1314 1315 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()` 1316 @*/ 1317 PetscErrorCode MatH2OpusOrthogonalize(Mat A) { 1318 PetscBool ish2opus; 1319 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1320 PetscMPIInt size; 1321 PetscBool boundtocpu = PETSC_TRUE; 1322 1323 PetscFunctionBegin; 1324 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1325 PetscValidType(A, 1); 1326 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 1327 if (!ish2opus) PetscFunctionReturn(0); 1328 if (a->orthogonal) PetscFunctionReturn(0); 1329 HLibProfile::clear(); 1330 PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog, A, 0, 0, 0)); 1331 #if defined(PETSC_H2OPUS_USE_GPU) 1332 boundtocpu = A->boundtocpu; 1333 #endif 1334 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1335 if (size > 1) { 1336 if (boundtocpu) { 1337 PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1338 #if defined(H2OPUS_USE_MPI) 1339 distributed_horthog(*a->dist_hmatrix, a->handle); 1340 #endif 1341 #if defined(PETSC_H2OPUS_USE_GPU) 1342 A->offloadmask = PETSC_OFFLOAD_CPU; 1343 } else { 1344 PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1345 PetscCall(PetscLogGpuTimeBegin()); 1346 #if defined(H2OPUS_USE_MPI) 1347 distributed_horthog(*a->dist_hmatrix_gpu, a->handle); 1348 #endif 1349 PetscCall(PetscLogGpuTimeEnd()); 1350 #endif 1351 } 1352 } else { 1353 #if defined(H2OPUS_USE_MPI) 1354 h2opusHandle_t handle = a->handle->handle; 1355 #else 1356 h2opusHandle_t handle = a->handle; 1357 #endif 1358 if (boundtocpu) { 1359 PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1360 horthog(*a->hmatrix, handle); 1361 #if defined(PETSC_H2OPUS_USE_GPU) 1362 A->offloadmask = PETSC_OFFLOAD_CPU; 1363 } else { 1364 PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1365 PetscCall(PetscLogGpuTimeBegin()); 1366 horthog(*a->hmatrix_gpu, handle); 1367 PetscCall(PetscLogGpuTimeEnd()); 1368 #endif 1369 } 1370 } 1371 a->orthogonal = PETSC_TRUE; 1372 { /* log flops */ 1373 double gops, time, perf, dev; 1374 HLibProfile::getHorthogPerf(gops, time, perf, dev); 1375 #if defined(PETSC_H2OPUS_USE_GPU) 1376 if (boundtocpu) { 1377 PetscCall(PetscLogFlops(1e9 * gops)); 1378 } else { 1379 PetscCall(PetscLogGpuFlops(1e9 * gops)); 1380 } 1381 #else 1382 PetscCall(PetscLogFlops(1e9 * gops)); 1383 #endif 1384 } 1385 PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog, A, 0, 0, 0)); 1386 PetscFunctionReturn(0); 1387 } 1388 1389 /*@C 1390 MatH2OpusCompress - Compress a hierarchical matrix. 1391 1392 Input Parameters: 1393 + A - the matrix 1394 - tol - the absolute truncation threshold 1395 1396 Level: intermediate 1397 1398 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()` 1399 @*/ 1400 PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol) { 1401 PetscBool ish2opus; 1402 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1403 PetscMPIInt size; 1404 PetscBool boundtocpu = PETSC_TRUE; 1405 1406 PetscFunctionBegin; 1407 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1408 PetscValidType(A, 1); 1409 PetscValidLogicalCollectiveReal(A, tol, 2); 1410 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 1411 if (!ish2opus || tol <= 0.0) PetscFunctionReturn(0); 1412 PetscCall(MatH2OpusOrthogonalize(A)); 1413 HLibProfile::clear(); 1414 PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress, A, 0, 0, 0)); 1415 #if defined(PETSC_H2OPUS_USE_GPU) 1416 boundtocpu = A->boundtocpu; 1417 #endif 1418 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1419 if (size > 1) { 1420 if (boundtocpu) { 1421 PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1422 #if defined(H2OPUS_USE_MPI) 1423 distributed_hcompress(*a->dist_hmatrix, tol, a->handle); 1424 if (a->resize) { 1425 DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix); 1426 delete a->dist_hmatrix; 1427 a->dist_hmatrix = dist_hmatrix; 1428 } 1429 #endif 1430 #if defined(PETSC_H2OPUS_USE_GPU) 1431 A->offloadmask = PETSC_OFFLOAD_CPU; 1432 } else { 1433 PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1434 PetscCall(PetscLogGpuTimeBegin()); 1435 #if defined(H2OPUS_USE_MPI) 1436 distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle); 1437 1438 if (a->resize) { 1439 DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu); 1440 delete a->dist_hmatrix_gpu; 1441 a->dist_hmatrix_gpu = dist_hmatrix_gpu; 1442 } 1443 #endif 1444 PetscCall(PetscLogGpuTimeEnd()); 1445 #endif 1446 } 1447 } else { 1448 #if defined(H2OPUS_USE_MPI) 1449 h2opusHandle_t handle = a->handle->handle; 1450 #else 1451 h2opusHandle_t handle = a->handle; 1452 #endif 1453 if (boundtocpu) { 1454 PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1455 hcompress(*a->hmatrix, tol, handle); 1456 1457 if (a->resize) { 1458 HMatrix *hmatrix = new HMatrix(*a->hmatrix); 1459 delete a->hmatrix; 1460 a->hmatrix = hmatrix; 1461 } 1462 #if defined(PETSC_H2OPUS_USE_GPU) 1463 A->offloadmask = PETSC_OFFLOAD_CPU; 1464 } else { 1465 PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1466 PetscCall(PetscLogGpuTimeBegin()); 1467 hcompress(*a->hmatrix_gpu, tol, handle); 1468 PetscCall(PetscLogGpuTimeEnd()); 1469 1470 if (a->resize) { 1471 HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu); 1472 delete a->hmatrix_gpu; 1473 a->hmatrix_gpu = hmatrix_gpu; 1474 } 1475 #endif 1476 } 1477 } 1478 { /* log flops */ 1479 double gops, time, perf, dev; 1480 HLibProfile::getHcompressPerf(gops, time, perf, dev); 1481 #if defined(PETSC_H2OPUS_USE_GPU) 1482 if (boundtocpu) { 1483 PetscCall(PetscLogFlops(1e9 * gops)); 1484 } else { 1485 PetscCall(PetscLogGpuFlops(1e9 * gops)); 1486 } 1487 #else 1488 PetscCall(PetscLogFlops(1e9 * gops)); 1489 #endif 1490 } 1491 PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress, A, 0, 0, 0)); 1492 PetscFunctionReturn(0); 1493 } 1494 1495 /*@C 1496 MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix vector product to construct a hierarchical matrix. 1497 1498 Input Parameters: 1499 + A - the hierarchical matrix 1500 . B - the matrix to be sampled 1501 . bs - maximum number of samples to be taken concurrently 1502 - tol - relative tolerance for construction 1503 1504 Notes: 1505 You need to call `MatAssemblyBegin()` and `MatAssemblyEnd()` to update the hierarchical matrix. 1506 1507 Level: intermediate 1508 1509 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()` 1510 @*/ 1511 PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol) { 1512 PetscBool ish2opus; 1513 1514 PetscFunctionBegin; 1515 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1516 PetscValidType(A, 1); 1517 if (B) PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 1518 PetscValidLogicalCollectiveInt(A, bs, 3); 1519 PetscValidLogicalCollectiveReal(A, tol, 3); 1520 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 1521 if (ish2opus) { 1522 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1523 1524 if (!a->sampler) a->sampler = new PetscMatrixSampler(); 1525 a->sampler->SetSamplingMat(B); 1526 if (bs > 0) a->bs = bs; 1527 if (tol > 0.) a->rtol = tol; 1528 delete a->kernel; 1529 } 1530 PetscFunctionReturn(0); 1531 } 1532 1533 /*@C 1534 MatCreateH2OpusFromKernel - Creates a `MATH2OPUS` from a user-supplied kernel. 1535 1536 Input Parameters: 1537 + comm - MPI communicator 1538 . m - number of local rows (or `PETSC_DECIDE` to have calculated if M is given) 1539 . n - number of local columns (or `PETSC_DECIDE` to have calculated if N is given) 1540 . M - number of global rows (or `PETSC_DETERMINE` to have calculated if m is given) 1541 . N - number of global columns (or `PETSC_DETERMINE` to have calculated if n is given) 1542 . spacedim - dimension of the space coordinates 1543 . coords - coordinates of the points 1544 . cdist - whether or not coordinates are distributed 1545 . kernel - computational kernel (or NULL) 1546 . kernelctx - kernel context 1547 . eta - admissibility condition tolerance 1548 . leafsize - leaf size in cluster tree 1549 - basisord - approximation order for Chebychev interpolation of low-rank blocks 1550 1551 Output Parameter: 1552 . nA - matrix 1553 1554 Options Database Keys: 1555 + -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree 1556 . -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance 1557 . -mat_h2opus_order <`PetscInt`> - Chebychev approximation order 1558 - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms 1559 1560 Level: intermediate 1561 1562 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()` 1563 @*/ 1564 PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat *nA) { 1565 Mat A; 1566 Mat_H2OPUS *h2opus; 1567 #if defined(PETSC_H2OPUS_USE_GPU) 1568 PetscBool iscpu = PETSC_FALSE; 1569 #else 1570 PetscBool iscpu = PETSC_TRUE; 1571 #endif 1572 1573 PetscFunctionBegin; 1574 PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported"); 1575 PetscCall(MatCreate(comm, &A)); 1576 PetscCall(MatSetSizes(A, m, n, M, N)); 1577 PetscCheck(M == N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported"); 1578 PetscCall(MatSetType(A, MATH2OPUS)); 1579 PetscCall(MatBindToCPU(A, iscpu)); 1580 PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, kernel, kernelctx)); 1581 1582 h2opus = (Mat_H2OPUS *)A->data; 1583 if (eta > 0.) h2opus->eta = eta; 1584 if (leafsize > 0) h2opus->leafsize = leafsize; 1585 if (basisord > 0) h2opus->basisord = basisord; 1586 1587 *nA = A; 1588 PetscFunctionReturn(0); 1589 } 1590 1591 /*@C 1592 MatCreateH2OpusFromMat - Creates a `MATH2OPUS` sampling from a user-supplied operator. 1593 1594 Input Parameters: 1595 + B - the matrix to be sampled 1596 . spacedim - dimension of the space coordinates 1597 . coords - coordinates of the points 1598 . cdist - whether or not coordinates are distributed 1599 . eta - admissibility condition tolerance 1600 . leafsize - leaf size in cluster tree 1601 . maxrank - maximum rank allowed 1602 . bs - maximum number of samples to be taken concurrently 1603 - rtol - relative tolerance for construction 1604 1605 Output Parameter: 1606 . nA - matrix 1607 1608 Options Database Keys: 1609 + -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree 1610 . -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance 1611 . -mat_h2opus_maxrank <`PetscInt`> - Maximum rank when constructed from matvecs 1612 . -mat_h2opus_samples <`PetscInt`> - Maximum number of samples to be taken concurrently when constructing from matvecs 1613 . -mat_h2opus_rtol <`PetscReal`> - Relative tolerance for construction from sampling 1614 . -mat_h2opus_check <`PetscBool`> - Check error when constructing from sampling during MatAssemblyEnd() 1615 . -mat_h2opus_hara_verbose <`PetscBool`> - Verbose output from hara construction 1616 - -mat_h2opus_normsamples <`PetscInt`> - Maximum bumber of samples to be when estimating norms 1617 1618 Note: 1619 Not available in parallel 1620 1621 Level: intermediate 1622 1623 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()` 1624 @*/ 1625 PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA) { 1626 Mat A; 1627 Mat_H2OPUS *h2opus; 1628 MPI_Comm comm; 1629 PetscBool boundtocpu = PETSC_TRUE; 1630 1631 PetscFunctionBegin; 1632 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 1633 PetscValidLogicalCollectiveInt(B, spacedim, 2); 1634 PetscValidLogicalCollectiveBool(B, cdist, 4); 1635 PetscValidLogicalCollectiveReal(B, eta, 5); 1636 PetscValidLogicalCollectiveInt(B, leafsize, 6); 1637 PetscValidLogicalCollectiveInt(B, maxrank, 7); 1638 PetscValidLogicalCollectiveInt(B, bs, 8); 1639 PetscValidLogicalCollectiveReal(B, rtol, 9); 1640 PetscValidPointer(nA, 10); 1641 PetscCall(PetscObjectGetComm((PetscObject)B, &comm)); 1642 PetscCheck(B->rmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported"); 1643 PetscCheck(B->rmap->N == B->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported"); 1644 PetscCall(MatCreate(comm, &A)); 1645 PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N)); 1646 #if defined(PETSC_H2OPUS_USE_GPU) 1647 { 1648 PetscBool iscuda; 1649 VecType vtype; 1650 1651 PetscCall(MatGetVecType(B, &vtype)); 1652 PetscCall(PetscStrcmp(vtype, VECCUDA, &iscuda)); 1653 if (!iscuda) { 1654 PetscCall(PetscStrcmp(vtype, VECSEQCUDA, &iscuda)); 1655 if (!iscuda) PetscCall(PetscStrcmp(vtype, VECMPICUDA, &iscuda)); 1656 } 1657 if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE; 1658 } 1659 #endif 1660 PetscCall(MatSetType(A, MATH2OPUS)); 1661 PetscCall(MatBindToCPU(A, boundtocpu)); 1662 if (spacedim) PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, NULL, NULL)); 1663 PetscCall(MatPropagateSymmetryOptions(B, A)); 1664 /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */ 1665 1666 h2opus = (Mat_H2OPUS *)A->data; 1667 h2opus->sampler = new PetscMatrixSampler(B); 1668 if (eta > 0.) h2opus->eta = eta; 1669 if (leafsize > 0) h2opus->leafsize = leafsize; 1670 if (maxrank > 0) h2opus->max_rank = maxrank; 1671 if (bs > 0) h2opus->bs = bs; 1672 if (rtol > 0.) h2opus->rtol = rtol; 1673 *nA = A; 1674 A->preallocated = PETSC_TRUE; 1675 PetscFunctionReturn(0); 1676 } 1677 1678 /*@C 1679 MatH2OpusGetIndexMap - Access reordering index set. 1680 1681 Input Parameters: 1682 . A - the matrix 1683 1684 Output Parameter: 1685 . indexmap - the index set for the reordering 1686 1687 Level: intermediate 1688 1689 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()` 1690 @*/ 1691 PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap) { 1692 PetscBool ish2opus; 1693 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1694 1695 PetscFunctionBegin; 1696 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1697 PetscValidType(A, 1); 1698 PetscValidPointer(indexmap, 2); 1699 PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 1700 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 1701 PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name); 1702 *indexmap = a->h2opus_indexmap; 1703 PetscFunctionReturn(0); 1704 } 1705 1706 /*@C 1707 MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering 1708 1709 Input Parameters: 1710 + A - the matrix 1711 . nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map 1712 - in - the vector to be mapped 1713 1714 Output Parameter: 1715 . out - the newly created mapped vector 1716 1717 Level: intermediate 1718 1719 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()` 1720 @*/ 1721 PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec *out) { 1722 PetscBool ish2opus; 1723 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1724 PetscScalar *xin, *xout; 1725 PetscBool nm; 1726 1727 PetscFunctionBegin; 1728 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1729 PetscValidType(A, 1); 1730 PetscValidLogicalCollectiveBool(A, nativetopetsc, 2); 1731 PetscValidHeaderSpecific(in, VEC_CLASSID, 3); 1732 PetscValidPointer(out, 4); 1733 PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 1734 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 1735 PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name); 1736 nm = a->nativemult; 1737 PetscCall(MatH2OpusSetNativeMult(A, (PetscBool)!nativetopetsc)); 1738 PetscCall(MatCreateVecs(A, out, NULL)); 1739 PetscCall(MatH2OpusSetNativeMult(A, nm)); 1740 if (!a->sf) { /* same ordering */ 1741 PetscCall(VecCopy(in, *out)); 1742 PetscFunctionReturn(0); 1743 } 1744 PetscCall(VecGetArrayRead(in, (const PetscScalar **)&xin)); 1745 PetscCall(VecGetArrayWrite(*out, &xout)); 1746 if (nativetopetsc) { 1747 PetscCall(PetscSFReduceBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE)); 1748 PetscCall(PetscSFReduceEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE)); 1749 } else { 1750 PetscCall(PetscSFBcastBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE)); 1751 PetscCall(PetscSFBcastEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE)); 1752 } 1753 PetscCall(VecRestoreArrayRead(in, (const PetscScalar **)&xin)); 1754 PetscCall(VecRestoreArrayWrite(*out, &xout)); 1755 PetscFunctionReturn(0); 1756 } 1757 1758 /*@C 1759 MatH2OpusLowRankUpdate - Perform a low-rank update of the form A = A + s * U * V^T 1760 1761 Input Parameters: 1762 + A - the hierarchical `MATH2OPUS` matrix 1763 . s - the scaling factor 1764 . U - the dense low-rank update matrix 1765 - V - (optional) the dense low-rank update matrix (if NULL, then V = U is assumed) 1766 1767 Note: 1768 The U and V matrices must be in dense format 1769 1770 Level: intermediate 1771 1772 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE` 1773 @*/ 1774 PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s) { 1775 PetscBool flg; 1776 1777 PetscFunctionBegin; 1778 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1779 PetscValidType(A, 1); 1780 PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 1781 PetscValidHeaderSpecific(U, MAT_CLASSID, 2); 1782 PetscCheckSameComm(A, 1, U, 2); 1783 if (V) { 1784 PetscValidHeaderSpecific(V, MAT_CLASSID, 3); 1785 PetscCheckSameComm(A, 1, V, 3); 1786 } 1787 PetscValidLogicalCollectiveScalar(A, s, 4); 1788 1789 if (!V) V = U; 1790 PetscCheck(U->cmap->N == V->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT, U->cmap->N, V->cmap->N); 1791 if (!U->cmap->N) PetscFunctionReturn(0); 1792 PetscCall(PetscLayoutCompare(U->rmap, A->rmap, &flg)); 1793 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A and U must have the same row layout"); 1794 PetscCall(PetscLayoutCompare(V->rmap, A->cmap, &flg)); 1795 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A column layout must match V row column layout"); 1796 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &flg)); 1797 if (flg) { 1798 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1799 const PetscScalar *u, *v, *uu, *vv; 1800 PetscInt ldu, ldv; 1801 PetscMPIInt size; 1802 #if defined(H2OPUS_USE_MPI) 1803 h2opusHandle_t handle = a->handle->handle; 1804 #else 1805 h2opusHandle_t handle = a->handle; 1806 #endif 1807 PetscBool usesf = (PetscBool)(a->sf && !a->nativemult); 1808 PetscSF usf, vsf; 1809 1810 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1811 PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet implemented in parallel"); 1812 PetscCall(PetscLogEventBegin(MAT_H2Opus_LR, A, 0, 0, 0)); 1813 PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U, &flg, MATSEQDENSE, MATMPIDENSE, "")); 1814 PetscCheck(flg, PetscObjectComm((PetscObject)U), PETSC_ERR_SUP, "Not for U of type %s", ((PetscObject)U)->type_name); 1815 PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V, &flg, MATSEQDENSE, MATMPIDENSE, "")); 1816 PetscCheck(flg, PetscObjectComm((PetscObject)V), PETSC_ERR_SUP, "Not for V of type %s", ((PetscObject)V)->type_name); 1817 PetscCall(MatDenseGetLDA(U, &ldu)); 1818 PetscCall(MatDenseGetLDA(V, &ldv)); 1819 PetscCall(MatBoundToCPU(A, &flg)); 1820 if (usesf) { 1821 PetscInt n; 1822 1823 PetscCall(MatDenseGetH2OpusVectorSF(U, a->sf, &usf)); 1824 PetscCall(MatDenseGetH2OpusVectorSF(V, a->sf, &vsf)); 1825 PetscCall(MatH2OpusResizeBuffers_Private(A, U->cmap->N, V->cmap->N)); 1826 PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL)); 1827 ldu = n; 1828 ldv = n; 1829 } 1830 if (flg) { 1831 PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1832 PetscCall(MatDenseGetArrayRead(U, &u)); 1833 PetscCall(MatDenseGetArrayRead(V, &v)); 1834 if (usesf) { 1835 vv = MatH2OpusGetThrustPointer(*a->yy); 1836 PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE)); 1837 PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE)); 1838 if (U != V) { 1839 uu = MatH2OpusGetThrustPointer(*a->xx); 1840 PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE)); 1841 PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE)); 1842 } else uu = vv; 1843 } else { 1844 uu = u; 1845 vv = v; 1846 } 1847 hlru_global(*a->hmatrix, uu, ldu, vv, ldv, U->cmap->N, s, handle); 1848 PetscCall(MatDenseRestoreArrayRead(U, &u)); 1849 PetscCall(MatDenseRestoreArrayRead(V, &v)); 1850 } else { 1851 #if defined(PETSC_H2OPUS_USE_GPU) 1852 PetscBool flgU, flgV; 1853 1854 PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1855 PetscCall(PetscObjectTypeCompareAny((PetscObject)U, &flgU, MATSEQDENSE, MATMPIDENSE, "")); 1856 if (flgU) PetscCall(MatConvert(U, MATDENSECUDA, MAT_INPLACE_MATRIX, &U)); 1857 PetscCall(PetscObjectTypeCompareAny((PetscObject)V, &flgV, MATSEQDENSE, MATMPIDENSE, "")); 1858 if (flgV) PetscCall(MatConvert(V, MATDENSECUDA, MAT_INPLACE_MATRIX, &V)); 1859 PetscCall(MatDenseCUDAGetArrayRead(U, &u)); 1860 PetscCall(MatDenseCUDAGetArrayRead(V, &v)); 1861 if (usesf) { 1862 vv = MatH2OpusGetThrustPointer(*a->yy_gpu); 1863 PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE)); 1864 PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE)); 1865 if (U != V) { 1866 uu = MatH2OpusGetThrustPointer(*a->xx_gpu); 1867 PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE)); 1868 PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE)); 1869 } else uu = vv; 1870 } else { 1871 uu = u; 1872 vv = v; 1873 } 1874 #else 1875 SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "This should not happen"); 1876 #endif 1877 hlru_global(*a->hmatrix_gpu, uu, ldu, vv, ldv, U->cmap->N, s, handle); 1878 #if defined(PETSC_H2OPUS_USE_GPU) 1879 PetscCall(MatDenseCUDARestoreArrayRead(U, &u)); 1880 PetscCall(MatDenseCUDARestoreArrayRead(V, &v)); 1881 if (flgU) PetscCall(MatConvert(U, MATDENSE, MAT_INPLACE_MATRIX, &U)); 1882 if (flgV) PetscCall(MatConvert(V, MATDENSE, MAT_INPLACE_MATRIX, &V)); 1883 #endif 1884 } 1885 PetscCall(PetscLogEventEnd(MAT_H2Opus_LR, A, 0, 0, 0)); 1886 a->orthogonal = PETSC_FALSE; 1887 } 1888 PetscFunctionReturn(0); 1889 } 1890 #endif 1891