1 #include <h2opusconf.h> 2 /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */ 3 #if !defined(__CUDACC__) || defined(H2OPUS_USE_GPU) 4 #include <h2opus.h> 5 #if defined(H2OPUS_USE_MPI) 6 #include <h2opus/distributed/distributed_h2opus_handle.h> 7 #include <h2opus/distributed/distributed_geometric_construction.h> 8 #include <h2opus/distributed/distributed_hgemv.h> 9 #include <h2opus/distributed/distributed_horthog.h> 10 #include <h2opus/distributed/distributed_hcompress.h> 11 #endif 12 #include <h2opus/util/boxentrygen.h> 13 #include <petsc/private/matimpl.h> 14 #include <petsc/private/vecimpl.h> 15 #include <petsc/private/deviceimpl.h> /*I "petscmat.h" I*/ 16 #include <petscsf.h> 17 18 /* math2opusutils */ 19 PETSC_INTERN PetscErrorCode MatDenseGetH2OpusStridedSF(Mat, PetscSF, PetscSF *); 20 PETSC_INTERN PetscErrorCode VecSign(Vec, Vec); 21 PETSC_INTERN PetscErrorCode VecSetDelta(Vec, PetscInt); 22 PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat, NormType, PetscInt, PetscReal *); 23 24 #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data()) 25 26 /* Use GPU only if H2OPUS is configured for GPU */ 27 #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU) 28 #define PETSC_H2OPUS_USE_GPU 29 #endif 30 #if defined(PETSC_H2OPUS_USE_GPU) 31 #define MatH2OpusUpdateIfNeeded(A, B) MatBindToCPU(A, (PetscBool)((A)->boundtocpu || (B))) 32 #else 33 #define MatH2OpusUpdateIfNeeded(A, B) PETSC_SUCCESS 34 #endif 35 36 // TODO H2OPUS: 37 // DistributedHMatrix 38 // unsymmetric ? 39 // transpose for distributed_hgemv? 40 // clearData() 41 // Unify interface for sequential and parallel? 42 // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled) 43 // 44 template <class T> 45 class PetscPointCloud : public H2OpusDataSet<T> { 46 private: 47 int dimension; 48 size_t num_points; 49 std::vector<T> pts; 50 51 public: 52 PetscPointCloud(int dim, size_t num_pts, const T coords[]) 53 { 54 dim = dim > 0 ? dim : 1; 55 this->dimension = dim; 56 this->num_points = num_pts; 57 58 pts.resize(num_pts * dim); 59 if (coords) { 60 for (size_t n = 0; n < num_pts; n++) 61 for (int i = 0; i < dim; i++) pts[n * dim + i] = coords[n * dim + i]; 62 } else { 63 PetscReal h = 1.0; //num_pts > 1 ? 
1./(num_pts - 1) : 0.0; 64 for (size_t n = 0; n < num_pts; n++) { 65 pts[n * dim] = n * h; 66 for (int i = 1; i < dim; i++) pts[n * dim + i] = 0.0; 67 } 68 } 69 } 70 71 PetscPointCloud(const PetscPointCloud<T> &other) 72 { 73 size_t N = other.dimension * other.num_points; 74 this->dimension = other.dimension; 75 this->num_points = other.num_points; 76 this->pts.resize(N); 77 for (size_t i = 0; i < N; i++) this->pts[i] = other.pts[i]; 78 } 79 80 int getDimension() const { return dimension; } 81 82 size_t getDataSetSize() const { return num_points; } 83 84 T getDataPoint(size_t idx, int dim) const 85 { 86 assert(dim < dimension && idx < num_points); 87 return pts[idx * dimension + dim]; 88 } 89 90 void Print(std::ostream &out = std::cout) 91 { 92 out << "Dimension: " << dimension << std::endl; 93 out << "NumPoints: " << num_points << std::endl; 94 for (size_t n = 0; n < num_points; n++) { 95 for (int d = 0; d < dimension; d++) out << pts[n * dimension + d] << " "; 96 out << std::endl; 97 } 98 } 99 }; 100 101 template <class T> 102 class PetscFunctionGenerator { 103 private: 104 MatH2OpusKernelFn *k; 105 int dim; 106 void *ctx; 107 108 public: 109 PetscFunctionGenerator(MatH2OpusKernelFn *k, int dim, void *ctx) 110 { 111 this->k = k; 112 this->dim = dim; 113 this->ctx = ctx; 114 } 115 PetscFunctionGenerator(PetscFunctionGenerator &other) 116 { 117 this->k = other.k; 118 this->dim = other.dim; 119 this->ctx = other.ctx; 120 } 121 T operator()(PetscReal *pt1, PetscReal *pt2) { return (T)((*this->k)(this->dim, pt1, pt2, this->ctx)); } 122 }; 123 124 #include <../src/mat/impls/h2opus/math2opussampler.hpp> 125 126 /* just to not clutter the code */ 127 #if !defined(H2OPUS_USE_GPU) 128 typedef HMatrix HMatrix_GPU; 129 #if defined(H2OPUS_USE_MPI) 130 typedef DistributedHMatrix DistributedHMatrix_GPU; 131 #endif 132 #endif 133 134 typedef struct { 135 #if defined(H2OPUS_USE_MPI) 136 distributedH2OpusHandle_t handle; 137 #else 138 h2opusHandle_t handle; 139 #endif 140 /* Sequential and parallel matrices are two different classes at the moment */ 141 HMatrix *hmatrix; 142 #if defined(H2OPUS_USE_MPI) 143 DistributedHMatrix *dist_hmatrix; 144 #else 145 HMatrix *dist_hmatrix; /* just to not clutter the code */ 146 #endif 147 /* May use permutations */ 148 PetscSF sf; 149 PetscLayout h2opus_rmap, h2opus_cmap; 150 IS h2opus_indexmap; 151 thrust::host_vector<PetscScalar> *xx, *yy; 152 PetscInt xxs, yys; 153 PetscBool multsetup; 154 155 /* GPU */ 156 HMatrix_GPU *hmatrix_gpu; 157 #if defined(H2OPUS_USE_MPI) 158 DistributedHMatrix_GPU *dist_hmatrix_gpu; 159 #else 160 HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */ 161 #endif 162 #if defined(PETSC_H2OPUS_USE_GPU) 163 thrust::device_vector<PetscScalar> *xx_gpu, *yy_gpu; 164 PetscInt xxs_gpu, yys_gpu; 165 #endif 166 167 /* construction from matvecs */ 168 PetscMatrixSampler *sampler; 169 PetscBool nativemult; 170 171 /* Admissibility */ 172 PetscReal eta; 173 PetscInt leafsize; 174 175 /* for dof reordering */ 176 PetscPointCloud<PetscReal> *ptcloud; 177 178 /* kernel for generating matrix entries */ 179 PetscFunctionGenerator<PetscScalar> *kernel; 180 181 /* basis orthogonalized? 
*/ 182 PetscBool orthogonal; 183 184 /* customization */ 185 PetscInt basisord; 186 PetscInt max_rank; 187 PetscInt bs; 188 PetscReal rtol; 189 PetscInt norm_max_samples; 190 PetscBool check_construction; 191 PetscBool hara_verbose; 192 PetscBool resize; 193 194 /* keeps track of MatScale values */ 195 PetscScalar s; 196 } Mat_H2OPUS; 197 198 static PetscErrorCode MatDestroy_H2OPUS(Mat A) 199 { 200 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 201 202 PetscFunctionBegin; 203 #if defined(H2OPUS_USE_MPI) 204 h2opusDestroyDistributedHandle(a->handle); 205 #else 206 h2opusDestroyHandle(a->handle); 207 #endif 208 delete a->dist_hmatrix; 209 delete a->hmatrix; 210 PetscCall(PetscSFDestroy(&a->sf)); 211 PetscCall(PetscLayoutDestroy(&a->h2opus_rmap)); 212 PetscCall(PetscLayoutDestroy(&a->h2opus_cmap)); 213 PetscCall(ISDestroy(&a->h2opus_indexmap)); 214 delete a->xx; 215 delete a->yy; 216 delete a->hmatrix_gpu; 217 delete a->dist_hmatrix_gpu; 218 #if defined(PETSC_H2OPUS_USE_GPU) 219 delete a->xx_gpu; 220 delete a->yy_gpu; 221 #endif 222 delete a->sampler; 223 delete a->ptcloud; 224 delete a->kernel; 225 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", NULL)); 226 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", NULL)); 227 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", NULL)); 228 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", NULL)); 229 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 230 PetscCall(PetscFree(A->data)); 231 PetscFunctionReturn(PETSC_SUCCESS); 232 } 233 234 PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm) 235 { 236 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 237 PetscBool ish2opus; 238 239 PetscFunctionBegin; 240 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 241 PetscValidLogicalCollectiveBool(A, nm, 2); 242 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 243 if (ish2opus) { 244 if (a->h2opus_rmap) { /* need to swap layouts for vector creation */ 245 if ((!a->nativemult && nm) || (a->nativemult && !nm)) { 246 PetscLayout t; 247 t = A->rmap; 248 A->rmap = a->h2opus_rmap; 249 a->h2opus_rmap = t; 250 t = A->cmap; 251 A->cmap = a->h2opus_cmap; 252 a->h2opus_cmap = t; 253 } 254 } 255 a->nativemult = nm; 256 } 257 PetscFunctionReturn(PETSC_SUCCESS); 258 } 259 260 PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm) 261 { 262 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 263 PetscBool ish2opus; 264 265 PetscFunctionBegin; 266 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 267 PetscAssertPointer(nm, 2); 268 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 269 PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name); 270 *nm = a->nativemult; 271 PetscFunctionReturn(PETSC_SUCCESS); 272 } 273 274 PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal *n) 275 { 276 PetscBool ish2opus; 277 PetscInt nmax = PETSC_DECIDE; 278 Mat_H2OPUS *a = NULL; 279 PetscBool mult = PETSC_FALSE; 280 281 PetscFunctionBegin; 282 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 283 if (ish2opus) { /* set userdefine number of samples and fastpath for mult (norms are order independent) */ 284 a = (Mat_H2OPUS *)A->data; 285 286 nmax = a->norm_max_samples; 287 mult = a->nativemult; 288 PetscCall(MatH2OpusSetNativeMult(A, PETSC_TRUE)); 289 } else { 
290 PetscCall(PetscOptionsGetInt(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_approximate_norm_samples", &nmax, NULL)); 291 } 292 PetscCall(MatApproximateNorm_Private(A, normtype, nmax, n)); 293 if (a) PetscCall(MatH2OpusSetNativeMult(A, mult)); 294 PetscFunctionReturn(PETSC_SUCCESS); 295 } 296 297 static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN) 298 { 299 Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data; 300 PetscInt n; 301 PetscBool boundtocpu = PETSC_TRUE; 302 303 PetscFunctionBegin; 304 #if defined(PETSC_H2OPUS_USE_GPU) 305 boundtocpu = A->boundtocpu; 306 #endif 307 PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL)); 308 if (boundtocpu) { 309 if (h2opus->xxs < xN) { 310 h2opus->xx->resize(n * xN); 311 h2opus->xxs = xN; 312 } 313 if (h2opus->yys < yN) { 314 h2opus->yy->resize(n * yN); 315 h2opus->yys = yN; 316 } 317 } 318 #if defined(PETSC_H2OPUS_USE_GPU) 319 if (!boundtocpu) { 320 if (h2opus->xxs_gpu < xN) { 321 h2opus->xx_gpu->resize(n * xN); 322 h2opus->xxs_gpu = xN; 323 } 324 if (h2opus->yys_gpu < yN) { 325 h2opus->yy_gpu->resize(n * yN); 326 h2opus->yys_gpu = yN; 327 } 328 } 329 #endif 330 PetscFunctionReturn(PETSC_SUCCESS); 331 } 332 333 static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C) 334 { 335 Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data; 336 #if defined(H2OPUS_USE_MPI) 337 h2opusHandle_t handle = h2opus->handle->handle; 338 #else 339 h2opusHandle_t handle = h2opus->handle; 340 #endif 341 PetscBool boundtocpu = PETSC_TRUE; 342 PetscScalar *xx, *yy, *uxx, *uyy; 343 PetscInt blda, clda; 344 PetscMPIInt size; 345 PetscSF bsf, csf; 346 PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult); 347 348 PetscFunctionBegin; 349 HLibProfile::clear(); 350 #if defined(PETSC_H2OPUS_USE_GPU) 351 boundtocpu = A->boundtocpu; 352 #endif 353 PetscCall(MatDenseGetLDA(B, &blda)); 354 PetscCall(MatDenseGetLDA(C, &clda)); 355 if (usesf) { 356 PetscInt n; 357 358 PetscCall(MatDenseGetH2OpusStridedSF(B, h2opus->sf, &bsf)); 359 PetscCall(MatDenseGetH2OpusStridedSF(C, h2opus->sf, &csf)); 360 361 PetscCall(MatH2OpusResizeBuffers_Private(A, B->cmap->N, C->cmap->N)); 362 PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL)); 363 blda = n; 364 clda = n; 365 } 366 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 367 if (boundtocpu) { 368 PetscCall(MatDenseGetArrayRead(B, (const PetscScalar **)&xx)); 369 PetscCall(MatDenseGetArrayWrite(C, &yy)); 370 if (usesf) { 371 uxx = MatH2OpusGetThrustPointer(*h2opus->xx); 372 uyy = MatH2OpusGetThrustPointer(*h2opus->yy); 373 PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 374 PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 375 } else { 376 uxx = xx; 377 uyy = yy; 378 } 379 if (size > 1) { 380 PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix"); 381 PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel"); 382 #if defined(H2OPUS_USE_MPI) 383 distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle); 384 #endif 385 } else { 386 PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 387 hgemv(transA ? 
H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle); 388 } 389 PetscCall(MatDenseRestoreArrayRead(B, (const PetscScalar **)&xx)); 390 if (usesf) { 391 PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 392 PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 393 } 394 PetscCall(MatDenseRestoreArrayWrite(C, &yy)); 395 #if defined(PETSC_H2OPUS_USE_GPU) 396 } else { 397 PetscBool ciscuda, biscuda; 398 399 /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */ 400 PetscCall(PetscObjectTypeCompareAny((PetscObject)B, &biscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, "")); 401 if (!biscuda) PetscCall(MatConvert(B, MATDENSECUDA, MAT_INPLACE_MATRIX, &B)); 402 PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &ciscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, "")); 403 if (!ciscuda) { 404 C->assembled = PETSC_TRUE; 405 PetscCall(MatConvert(C, MATDENSECUDA, MAT_INPLACE_MATRIX, &C)); 406 } 407 PetscCall(MatDenseCUDAGetArrayRead(B, (const PetscScalar **)&xx)); 408 PetscCall(MatDenseCUDAGetArrayWrite(C, &yy)); 409 if (usesf) { 410 uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu); 411 uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu); 412 PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 413 PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 414 } else { 415 uxx = xx; 416 uyy = yy; 417 } 418 PetscCall(PetscLogGpuTimeBegin()); 419 if (size > 1) { 420 PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix"); 421 PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel"); 422 #if defined(H2OPUS_USE_MPI) 423 distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle); 424 #endif 425 } else { 426 PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 427 hgemv(transA ? 
H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle); 428 } 429 PetscCall(PetscLogGpuTimeEnd()); 430 PetscCall(MatDenseCUDARestoreArrayRead(B, (const PetscScalar **)&xx)); 431 if (usesf) { 432 PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 433 PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 434 } 435 PetscCall(MatDenseCUDARestoreArrayWrite(C, &yy)); 436 if (!biscuda) PetscCall(MatConvert(B, MATDENSE, MAT_INPLACE_MATRIX, &B)); 437 if (!ciscuda) PetscCall(MatConvert(C, MATDENSE, MAT_INPLACE_MATRIX, &C)); 438 #endif 439 } 440 { /* log flops */ 441 double gops, time, perf, dev; 442 HLibProfile::getHgemvPerf(gops, time, perf, dev); 443 #if defined(PETSC_H2OPUS_USE_GPU) 444 if (boundtocpu) { 445 PetscCall(PetscLogFlops(1e9 * gops)); 446 } else { 447 PetscCall(PetscLogGpuFlops(1e9 * gops)); 448 } 449 #else 450 PetscCall(PetscLogFlops(1e9 * gops)); 451 #endif 452 } 453 PetscFunctionReturn(PETSC_SUCCESS); 454 } 455 456 static PetscErrorCode MatProductNumeric_H2OPUS(Mat C) 457 { 458 Mat_Product *product = C->product; 459 460 PetscFunctionBegin; 461 MatCheckProduct(C, 1); 462 switch (product->type) { 463 case MATPRODUCT_AB: 464 PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_FALSE, product->B, C)); 465 break; 466 case MATPRODUCT_AtB: 467 PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_TRUE, product->B, C)); 468 break; 469 default: 470 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]); 471 } 472 PetscFunctionReturn(PETSC_SUCCESS); 473 } 474 475 static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C) 476 { 477 Mat_Product *product = C->product; 478 PetscBool cisdense; 479 Mat A, B; 480 481 PetscFunctionBegin; 482 MatCheckProduct(C, 1); 483 A = product->A; 484 B = product->B; 485 switch (product->type) { 486 case MATPRODUCT_AB: 487 PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N)); 488 PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B)); 489 PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, "")); 490 if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name)); 491 PetscCall(MatSetUp(C)); 492 break; 493 case MATPRODUCT_AtB: 494 PetscCall(MatSetSizes(C, A->cmap->n, B->cmap->n, A->cmap->N, B->cmap->N)); 495 PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B)); 496 PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, "")); 497 if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name)); 498 PetscCall(MatSetUp(C)); 499 break; 500 default: 501 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]); 502 } 503 C->ops->productsymbolic = NULL; 504 C->ops->productnumeric = MatProductNumeric_H2OPUS; 505 PetscFunctionReturn(PETSC_SUCCESS); 506 } 507 508 static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C) 509 { 510 PetscFunctionBegin; 511 MatCheckProduct(C, 1); 512 if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) C->ops->productsymbolic = MatProductSymbolic_H2OPUS; 513 PetscFunctionReturn(PETSC_SUCCESS); 514 } 515 516 static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans) 517 { 518 Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data; 519 #if defined(H2OPUS_USE_MPI) 520 h2opusHandle_t handle = 
h2opus->handle->handle; 521 #else 522 h2opusHandle_t handle = h2opus->handle; 523 #endif 524 PetscBool boundtocpu = PETSC_TRUE; 525 PetscInt n; 526 PetscScalar *xx, *yy, *uxx, *uyy; 527 PetscMPIInt size; 528 PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult); 529 530 PetscFunctionBegin; 531 HLibProfile::clear(); 532 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 533 #if defined(PETSC_H2OPUS_USE_GPU) 534 boundtocpu = A->boundtocpu; 535 #endif 536 if (usesf) PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL)); 537 else n = A->rmap->n; 538 if (boundtocpu) { 539 PetscCall(VecGetArrayRead(x, (const PetscScalar **)&xx)); 540 if (sy == 0.0) { 541 PetscCall(VecGetArrayWrite(y, &yy)); 542 } else { 543 PetscCall(VecGetArray(y, &yy)); 544 } 545 if (usesf) { 546 uxx = MatH2OpusGetThrustPointer(*h2opus->xx); 547 uyy = MatH2OpusGetThrustPointer(*h2opus->yy); 548 549 PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 550 PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 551 if (sy != 0.0) { 552 PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE)); 553 PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE)); 554 } 555 } else { 556 uxx = xx; 557 uyy = yy; 558 } 559 if (size > 1) { 560 PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix"); 561 PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel"); 562 #if defined(H2OPUS_USE_MPI) 563 distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle); 564 #endif 565 } else { 566 PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 567 hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle); 568 } 569 PetscCall(VecRestoreArrayRead(x, (const PetscScalar **)&xx)); 570 if (usesf) { 571 PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 572 PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 573 } 574 if (sy == 0.0) { 575 PetscCall(VecRestoreArrayWrite(y, &yy)); 576 } else { 577 PetscCall(VecRestoreArray(y, &yy)); 578 } 579 #if defined(PETSC_H2OPUS_USE_GPU) 580 } else { 581 PetscCall(VecCUDAGetArrayRead(x, (const PetscScalar **)&xx)); 582 if (sy == 0.0) { 583 PetscCall(VecCUDAGetArrayWrite(y, &yy)); 584 } else { 585 PetscCall(VecCUDAGetArray(y, &yy)); 586 } 587 if (usesf) { 588 uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu); 589 uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu); 590 591 PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 592 PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 593 if (sy != 0.0) { 594 PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE)); 595 PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE)); 596 } 597 } else { 598 uxx = xx; 599 uyy = yy; 600 } 601 PetscCall(PetscLogGpuTimeBegin()); 602 if (size > 1) { 603 PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix"); 604 PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel"); 605 #if defined(H2OPUS_USE_MPI) 606 distributed_hgemv(/*trans ? 
H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle); 607 #endif 608 } else { 609 PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 610 hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle); 611 } 612 PetscCall(PetscLogGpuTimeEnd()); 613 PetscCall(VecCUDARestoreArrayRead(x, (const PetscScalar **)&xx)); 614 if (usesf) { 615 PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 616 PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 617 } 618 if (sy == 0.0) { 619 PetscCall(VecCUDARestoreArrayWrite(y, &yy)); 620 } else { 621 PetscCall(VecCUDARestoreArray(y, &yy)); 622 } 623 #endif 624 } 625 { /* log flops */ 626 double gops, time, perf, dev; 627 HLibProfile::getHgemvPerf(gops, time, perf, dev); 628 #if defined(PETSC_H2OPUS_USE_GPU) 629 if (boundtocpu) { 630 PetscCall(PetscLogFlops(1e9 * gops)); 631 } else { 632 PetscCall(PetscLogGpuFlops(1e9 * gops)); 633 } 634 #else 635 PetscCall(PetscLogFlops(1e9 * gops)); 636 #endif 637 } 638 PetscFunctionReturn(PETSC_SUCCESS); 639 } 640 641 static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y) 642 { 643 PetscBool xiscuda, yiscuda; 644 645 PetscFunctionBegin; 646 PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, "")); 647 PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, "")); 648 PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda)); 649 PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_TRUE)); 650 PetscFunctionReturn(PETSC_SUCCESS); 651 } 652 653 static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y) 654 { 655 PetscBool xiscuda, yiscuda; 656 657 PetscFunctionBegin; 658 PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, "")); 659 PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, "")); 660 PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda)); 661 PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_FALSE)); 662 PetscFunctionReturn(PETSC_SUCCESS); 663 } 664 665 static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z) 666 { 667 PetscBool xiscuda, ziscuda; 668 669 PetscFunctionBegin; 670 PetscCall(VecCopy(y, z)); 671 PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, "")); 672 PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, "")); 673 PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda)); 674 PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_TRUE)); 675 PetscFunctionReturn(PETSC_SUCCESS); 676 } 677 678 static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z) 679 { 680 PetscBool xiscuda, ziscuda; 681 682 PetscFunctionBegin; 683 PetscCall(VecCopy(y, z)); 684 PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, "")); 685 PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, "")); 686 PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda)); 687 PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_FALSE)); 688 PetscFunctionReturn(PETSC_SUCCESS); 689 } 690 691 static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s) 692 { 693 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 694 695 PetscFunctionBegin; 696 a->s *= s; 697 PetscFunctionReturn(PETSC_SUCCESS); 698 } 699 700 static PetscErrorCode MatSetFromOptions_H2OPUS(Mat 
A, PetscOptionItems PetscOptionsObject) 701 { 702 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 703 704 PetscFunctionBegin; 705 PetscOptionsHeadBegin(PetscOptionsObject, "H2OPUS options"); 706 PetscCall(PetscOptionsInt("-mat_h2opus_leafsize", "Leaf size of cluster tree", NULL, a->leafsize, &a->leafsize, NULL)); 707 PetscCall(PetscOptionsReal("-mat_h2opus_eta", "Admissibility condition tolerance", NULL, a->eta, &a->eta, NULL)); 708 PetscCall(PetscOptionsInt("-mat_h2opus_order", "Basis order for off-diagonal sampling when constructed from kernel", NULL, a->basisord, &a->basisord, NULL)); 709 PetscCall(PetscOptionsInt("-mat_h2opus_maxrank", "Maximum rank when constructed from matvecs", NULL, a->max_rank, &a->max_rank, NULL)); 710 PetscCall(PetscOptionsInt("-mat_h2opus_samples", "Maximum number of samples to be taken concurrently when constructing from matvecs", NULL, a->bs, &a->bs, NULL)); 711 PetscCall(PetscOptionsInt("-mat_h2opus_normsamples", "Maximum number of samples to be used when estimating norms", NULL, a->norm_max_samples, &a->norm_max_samples, NULL)); 712 PetscCall(PetscOptionsReal("-mat_h2opus_rtol", "Relative tolerance for construction from sampling", NULL, a->rtol, &a->rtol, NULL)); 713 PetscCall(PetscOptionsBool("-mat_h2opus_check", "Check error when constructing from sampling during MatAssemblyEnd()", NULL, a->check_construction, &a->check_construction, NULL)); 714 PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose", "Verbose output from hara construction", NULL, a->hara_verbose, &a->hara_verbose, NULL)); 715 PetscCall(PetscOptionsBool("-mat_h2opus_resize", "Resize after compression", NULL, a->resize, &a->resize, NULL)); 716 PetscOptionsHeadEnd(); 717 PetscFunctionReturn(PETSC_SUCCESS); 718 } 719 720 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat, PetscInt, const PetscReal[], PetscBool, MatH2OpusKernelFn *, void *); 721 722 static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A) 723 { 724 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 725 Vec c; 726 PetscInt spacedim; 727 const PetscScalar *coords; 728 729 PetscFunctionBegin; 730 if (a->ptcloud) PetscFunctionReturn(PETSC_SUCCESS); 731 PetscCall(PetscObjectQuery((PetscObject)A, "__math2opus_coords", (PetscObject *)&c)); 732 if (!c && a->sampler) { 733 Mat S = a->sampler->GetSamplingMat(); 734 735 PetscCall(PetscObjectQuery((PetscObject)S, "__math2opus_coords", (PetscObject *)&c)); 736 } 737 if (!c) { 738 PetscCall(MatH2OpusSetCoords_H2OPUS(A, -1, NULL, PETSC_FALSE, NULL, NULL)); 739 } else { 740 PetscCall(VecGetArrayRead(c, &coords)); 741 PetscCall(VecGetBlockSize(c, &spacedim)); 742 PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, PETSC_FALSE, NULL, NULL)); 743 PetscCall(VecRestoreArrayRead(c, &coords)); 744 } 745 PetscFunctionReturn(PETSC_SUCCESS); 746 } 747 748 static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A) 749 { 750 MPI_Comm comm; 751 PetscMPIInt size; 752 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 753 PetscInt n = 0, *idx = NULL; 754 int *iidx = NULL; 755 PetscCopyMode own; 756 PetscBool rid; 757 758 PetscFunctionBegin; 759 if (a->multsetup) PetscFunctionReturn(PETSC_SUCCESS); 760 if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */ 761 PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL)); 762 #if defined(PETSC_H2OPUS_USE_GPU) 763 a->xx_gpu = new thrust::device_vector<PetscScalar>(n); 764 a->yy_gpu = new thrust::device_vector<PetscScalar>(n); 765 a->xxs_gpu = 1; 766 a->yys_gpu = 1; 767 #endif 768 a->xx = new thrust::host_vector<PetscScalar>(n); 769 a->yy = new thrust::host_vector<PetscScalar>(n); 770 
a->xxs = 1; 771 a->yys = 1; 772 } else { 773 IS is; 774 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 775 PetscCallMPI(MPI_Comm_size(comm, &size)); 776 if (!a->h2opus_indexmap) { 777 if (size > 1) { 778 PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix"); 779 #if defined(H2OPUS_USE_MPI) 780 iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map); 781 n = a->dist_hmatrix->basis_tree.basis_branch.index_map.size(); 782 #endif 783 } else { 784 iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map); 785 n = a->hmatrix->u_basis_tree.index_map.size(); 786 } 787 788 if (PetscDefined(USE_64BIT_INDICES)) { 789 PetscInt i; 790 791 own = PETSC_OWN_POINTER; 792 PetscCall(PetscMalloc1(n, &idx)); 793 for (i = 0; i < n; i++) idx[i] = iidx[i]; 794 } else { 795 own = PETSC_COPY_VALUES; 796 idx = (PetscInt *)iidx; 797 } 798 PetscCall(ISCreateGeneral(comm, n, idx, own, &is)); 799 PetscCall(ISSetPermutation(is)); 800 PetscCall(ISViewFromOptions(is, (PetscObject)A, "-mat_h2opus_indexmap_view")); 801 a->h2opus_indexmap = is; 802 } 803 PetscCall(ISGetLocalSize(a->h2opus_indexmap, &n)); 804 PetscCall(ISGetIndices(a->h2opus_indexmap, (const PetscInt **)&idx)); 805 rid = (PetscBool)(n == A->rmap->n); 806 PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &rid, 1, MPIU_BOOL, MPI_LAND, comm)); 807 if (rid) PetscCall(ISIdentity(a->h2opus_indexmap, &rid)); 808 if (!rid) { 809 if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */ 810 PetscCall(PetscLayoutCreate(comm, &a->h2opus_rmap)); 811 PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap, n)); 812 PetscCall(PetscLayoutSetUp(a->h2opus_rmap)); 813 PetscCall(PetscLayoutReference(a->h2opus_rmap, &a->h2opus_cmap)); 814 } 815 PetscCall(PetscSFCreate(comm, &a->sf)); 816 PetscCall(PetscSFSetGraphLayout(a->sf, A->rmap, n, NULL, PETSC_OWN_POINTER, idx)); 817 PetscCall(PetscSFSetUp(a->sf)); 818 PetscCall(PetscSFViewFromOptions(a->sf, (PetscObject)A, "-mat_h2opus_sf_view")); 819 #if defined(PETSC_H2OPUS_USE_GPU) 820 a->xx_gpu = new thrust::device_vector<PetscScalar>(n); 821 a->yy_gpu = new thrust::device_vector<PetscScalar>(n); 822 a->xxs_gpu = 1; 823 a->yys_gpu = 1; 824 #endif 825 a->xx = new thrust::host_vector<PetscScalar>(n); 826 a->yy = new thrust::host_vector<PetscScalar>(n); 827 a->xxs = 1; 828 a->yys = 1; 829 } 830 PetscCall(ISRestoreIndices(a->h2opus_indexmap, (const PetscInt **)&idx)); 831 } 832 a->multsetup = PETSC_TRUE; 833 PetscFunctionReturn(PETSC_SUCCESS); 834 } 835 836 static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype) 837 { 838 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 839 #if defined(H2OPUS_USE_MPI) 840 h2opusHandle_t handle = a->handle->handle; 841 #else 842 h2opusHandle_t handle = a->handle; 843 #endif 844 PetscBool kernel = PETSC_FALSE; 845 PetscBool boundtocpu = PETSC_TRUE; 846 PetscBool samplingdone = PETSC_FALSE; 847 MPI_Comm comm; 848 PetscMPIInt size; 849 850 PetscFunctionBegin; 851 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 852 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported"); 853 PetscCheck(A->rmap->N == A->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported"); 854 855 /* XXX */ 856 a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N)); 857 858 PetscCallMPI(MPI_Comm_size(comm, &size)); 859 /* TODO REUSABILITY of geometric construction 
*/ 860 delete a->hmatrix; 861 delete a->dist_hmatrix; 862 #if defined(PETSC_H2OPUS_USE_GPU) 863 delete a->hmatrix_gpu; 864 delete a->dist_hmatrix_gpu; 865 #endif 866 a->orthogonal = PETSC_FALSE; 867 868 /* TODO: other? */ 869 H2OpusBoxCenterAdmissibility adm(a->eta); 870 871 PetscCall(PetscLogEventBegin(MAT_H2Opus_Build, A, 0, 0, 0)); 872 if (size > 1) { 873 #if defined(H2OPUS_USE_MPI) 874 a->dist_hmatrix = new DistributedHMatrix(A->rmap->n /* ,A->symmetric */); 875 #else 876 a->dist_hmatrix = NULL; 877 #endif 878 } else a->hmatrix = new HMatrix(A->rmap->n, A->symmetric == PETSC_BOOL3_TRUE); 879 PetscCall(MatH2OpusInferCoordinates_Private(A)); 880 PetscCheck(a->ptcloud, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing pointcloud"); 881 if (a->kernel) { 882 BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel); 883 if (size > 1) { 884 PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix"); 885 #if defined(H2OPUS_USE_MPI) 886 buildDistributedHMatrix(*a->dist_hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord, a->handle); 887 #endif 888 } else { 889 buildHMatrix(*a->hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord); 890 } 891 kernel = PETSC_TRUE; 892 } else { 893 PetscCheck(size <= 1, comm, PETSC_ERR_SUP, "Construction from sampling not supported in parallel"); 894 buildHMatrixStructure(*a->hmatrix, a->ptcloud, a->leafsize, adm); 895 } 896 PetscCall(MatSetUpMultiply_H2OPUS(A)); 897 898 #if defined(PETSC_H2OPUS_USE_GPU) 899 boundtocpu = A->boundtocpu; 900 if (!boundtocpu) { 901 if (size > 1) { 902 PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix"); 903 #if defined(H2OPUS_USE_MPI) 904 a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix); 905 #endif 906 } else { 907 a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix); 908 } 909 } 910 #endif 911 if (size == 1) { 912 if (!kernel && a->sampler && a->sampler->GetSamplingMat()) { 913 PetscReal Anorm; 914 bool verbose; 915 916 PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_hara_verbose", &a->hara_verbose, NULL)); 917 verbose = a->hara_verbose; 918 PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(), NORM_2, a->norm_max_samples, &Anorm)); 919 if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF, "Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n", a->max_rank, a->rtol * Anorm, a->rtol, Anorm, boundtocpu ? "CPU" : "GPU", a->bs)); 920 if (a->sf && !a->nativemult) a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(), a->hmatrix->u_basis_tree.index_map.data()); 921 a->sampler->SetStream(handle->getMainStream()); 922 if (boundtocpu) { 923 a->sampler->SetGPUSampling(false); 924 hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose); 925 #if defined(PETSC_H2OPUS_USE_GPU) 926 } else { 927 a->sampler->SetGPUSampling(true); 928 hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose); 929 #endif 930 } 931 samplingdone = PETSC_TRUE; 932 } 933 } 934 #if defined(PETSC_H2OPUS_USE_GPU) 935 if (!boundtocpu) { 936 delete a->hmatrix; 937 delete a->dist_hmatrix; 938 a->hmatrix = NULL; 939 a->dist_hmatrix = NULL; 940 } 941 A->offloadmask = boundtocpu ? 
PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU; 942 #endif 943 PetscCall(PetscLogEventEnd(MAT_H2Opus_Build, A, 0, 0, 0)); 944 945 if (!a->s) a->s = 1.0; 946 A->assembled = PETSC_TRUE; 947 948 if (samplingdone) { 949 PetscBool check = a->check_construction; 950 PetscBool checke = PETSC_FALSE; 951 952 PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check", &check, NULL)); 953 PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check_explicit", &checke, NULL)); 954 if (check) { 955 Mat E, Ae; 956 PetscReal n1, ni, n2; 957 PetscReal n1A, niA, n2A; 958 PetscErrorCodeFn *normfunc; 959 960 Ae = a->sampler->GetSamplingMat(); 961 PetscCall(MatConvert(A, MATSHELL, MAT_INITIAL_MATRIX, &E)); 962 PetscCall(MatShellSetOperation(E, MATOP_NORM, (PetscErrorCodeFn *)MatNorm_H2OPUS)); 963 PetscCall(MatAXPY(E, -1.0, Ae, DIFFERENT_NONZERO_PATTERN)); 964 PetscCall(MatNorm(E, NORM_1, &n1)); 965 PetscCall(MatNorm(E, NORM_INFINITY, &ni)); 966 PetscCall(MatNorm(E, NORM_2, &n2)); 967 if (checke) { 968 Mat eA, eE, eAe; 969 970 PetscCall(MatComputeOperator(A, MATAIJ, &eA)); 971 PetscCall(MatComputeOperator(E, MATAIJ, &eE)); 972 PetscCall(MatComputeOperator(Ae, MATAIJ, &eAe)); 973 PetscCall(MatFilter(eA, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE)); 974 PetscCall(MatFilter(eE, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE)); 975 PetscCall(MatFilter(eAe, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE)); 976 PetscCall(PetscObjectSetName((PetscObject)eA, "H2Mat")); 977 PetscCall(MatView(eA, NULL)); 978 PetscCall(PetscObjectSetName((PetscObject)eAe, "S")); 979 PetscCall(MatView(eAe, NULL)); 980 PetscCall(PetscObjectSetName((PetscObject)eE, "H2Mat - S")); 981 PetscCall(MatView(eE, NULL)); 982 PetscCall(MatDestroy(&eA)); 983 PetscCall(MatDestroy(&eE)); 984 PetscCall(MatDestroy(&eAe)); 985 } 986 987 PetscCall(MatGetOperation(Ae, MATOP_NORM, &normfunc)); 988 PetscCall(MatSetOperation(Ae, MATOP_NORM, (PetscErrorCodeFn *)MatNorm_H2OPUS)); 989 PetscCall(MatNorm(Ae, NORM_1, &n1A)); 990 PetscCall(MatNorm(Ae, NORM_INFINITY, &niA)); 991 PetscCall(MatNorm(Ae, NORM_2, &n2A)); 992 n1A = PetscMax(n1A, PETSC_SMALL); 993 n2A = PetscMax(n2A, PETSC_SMALL); 994 niA = PetscMax(niA, PETSC_SMALL); 995 PetscCall(MatSetOperation(Ae, MATOP_NORM, normfunc)); 996 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n", (double)n1, (double)ni, (double)n2, (double)(n1 / n1A), (double)(ni / niA), (double)(n2 / n2A))); 997 PetscCall(MatDestroy(&E)); 998 } 999 a->sampler->SetSamplingMat(NULL); 1000 } 1001 PetscFunctionReturn(PETSC_SUCCESS); 1002 } 1003 1004 static PetscErrorCode MatZeroEntries_H2OPUS(Mat A) 1005 { 1006 PetscMPIInt size; 1007 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1008 1009 PetscFunctionBegin; 1010 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1011 PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet supported"); 1012 a->hmatrix->clearData(); 1013 #if defined(PETSC_H2OPUS_USE_GPU) 1014 if (a->hmatrix_gpu) a->hmatrix_gpu->clearData(); 1015 #endif 1016 PetscFunctionReturn(PETSC_SUCCESS); 1017 } 1018 1019 static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA) 1020 { 1021 Mat A; 1022 Mat_H2OPUS *a, *b = (Mat_H2OPUS *)B->data; 1023 #if defined(PETSC_H2OPUS_USE_GPU) 1024 PetscBool iscpu = PETSC_FALSE; 1025 #else 1026 PetscBool iscpu = PETSC_TRUE; 1027 #endif 1028 MPI_Comm comm; 1029 1030 PetscFunctionBegin; 1031 
PetscCall(PetscObjectGetComm((PetscObject)B, &comm)); 1032 PetscCall(MatCreate(comm, &A)); 1033 PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N)); 1034 PetscCall(MatSetType(A, MATH2OPUS)); 1035 PetscCall(MatPropagateSymmetryOptions(B, A)); 1036 a = (Mat_H2OPUS *)A->data; 1037 1038 a->eta = b->eta; 1039 a->leafsize = b->leafsize; 1040 a->basisord = b->basisord; 1041 a->max_rank = b->max_rank; 1042 a->bs = b->bs; 1043 a->rtol = b->rtol; 1044 a->norm_max_samples = b->norm_max_samples; 1045 if (op == MAT_COPY_VALUES) a->s = b->s; 1046 1047 a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud); 1048 if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel); 1049 1050 #if defined(H2OPUS_USE_MPI) 1051 if (b->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix); 1052 #if defined(PETSC_H2OPUS_USE_GPU) 1053 if (b->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu); 1054 #endif 1055 #endif 1056 if (b->hmatrix) { 1057 a->hmatrix = new HMatrix(*b->hmatrix); 1058 if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData(); 1059 } 1060 #if defined(PETSC_H2OPUS_USE_GPU) 1061 if (b->hmatrix_gpu) { 1062 a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu); 1063 if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData(); 1064 } 1065 #endif 1066 if (b->sf) { 1067 PetscCall(PetscObjectReference((PetscObject)b->sf)); 1068 a->sf = b->sf; 1069 } 1070 if (b->h2opus_indexmap) { 1071 PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap)); 1072 a->h2opus_indexmap = b->h2opus_indexmap; 1073 } 1074 1075 PetscCall(MatSetUp(A)); 1076 PetscCall(MatSetUpMultiply_H2OPUS(A)); 1077 if (op == MAT_COPY_VALUES) { 1078 A->assembled = PETSC_TRUE; 1079 a->orthogonal = b->orthogonal; 1080 #if defined(PETSC_H2OPUS_USE_GPU) 1081 A->offloadmask = B->offloadmask; 1082 #endif 1083 } 1084 #if defined(PETSC_H2OPUS_USE_GPU) 1085 iscpu = B->boundtocpu; 1086 #endif 1087 PetscCall(MatBindToCPU(A, iscpu)); 1088 1089 *nA = A; 1090 PetscFunctionReturn(PETSC_SUCCESS); 1091 } 1092 1093 static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view) 1094 { 1095 Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data; 1096 PetscBool isascii, vieweps; 1097 PetscMPIInt size; 1098 PetscViewerFormat format; 1099 1100 PetscFunctionBegin; 1101 PetscCall(PetscObjectTypeCompare((PetscObject)view, PETSCVIEWERASCII, &isascii)); 1102 PetscCall(PetscViewerGetFormat(view, &format)); 1103 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1104 if (isascii) { 1105 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1106 if (size == 1) { 1107 FILE *fp; 1108 PetscCall(PetscViewerASCIIGetPointer(view, &fp)); 1109 dumpHMatrix(*h2opus->hmatrix, 6, fp); 1110 } 1111 } else { 1112 PetscCall(PetscViewerASCIIPrintf(view, " H-Matrix constructed from %s\n", h2opus->kernel ? "Kernel" : "Mat")); 1113 PetscCall(PetscViewerASCIIPrintf(view, " PointCloud dim %" PetscInt_FMT "\n", h2opus->ptcloud ? 
h2opus->ptcloud->getDimension() : 0)); 1114 PetscCall(PetscViewerASCIIPrintf(view, " Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n", h2opus->leafsize, (double)h2opus->eta)); 1115 if (!h2opus->kernel) { 1116 PetscCall(PetscViewerASCIIPrintf(view, " Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n", h2opus->max_rank, h2opus->bs, (double)h2opus->rtol)); 1117 } else { 1118 PetscCall(PetscViewerASCIIPrintf(view, " Off-diagonal blocks approximation order %" PetscInt_FMT "\n", h2opus->basisord)); 1119 } 1120 PetscCall(PetscViewerASCIIPrintf(view, " Number of samples for norms %" PetscInt_FMT "\n", h2opus->norm_max_samples)); 1121 if (size == 1) { 1122 double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0; 1123 double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0; 1124 #if defined(PETSC_HAVE_CUDA) 1125 double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0; 1126 double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0; 1127 #endif 1128 PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu)); 1129 #if defined(PETSC_HAVE_CUDA) 1130 PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu)); 1131 #endif 1132 } else { 1133 #if defined(PETSC_HAVE_CUDA) 1134 double matrix_mem[4] = {0., 0., 0., 0.}; 1135 PetscMPIInt rsize = 4; 1136 #else 1137 double matrix_mem[2] = {0., 0.}; 1138 PetscMPIInt rsize = 2; 1139 #endif 1140 #if defined(H2OPUS_USE_MPI) 1141 matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0; 1142 matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0; 1143 #if defined(PETSC_HAVE_CUDA) 1144 matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0; 1145 matrix_mem[3] = h2opus->dist_hmatrix_gpu ? 
h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0; 1146 #endif 1147 #endif 1148 PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, matrix_mem, rsize, MPI_DOUBLE_PRECISION, MPI_SUM, PetscObjectComm((PetscObject)A))); 1149 PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1])); 1150 #if defined(PETSC_HAVE_CUDA) 1151 PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3])); 1152 #endif 1153 } 1154 } 1155 } 1156 vieweps = PETSC_FALSE; 1157 PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps", &vieweps, NULL)); 1158 if (vieweps) { 1159 char filename[256]; 1160 const char *name; 1161 1162 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1163 PetscCall(PetscSNPrintf(filename, sizeof(filename), "%s_structure.eps", name)); 1164 PetscCall(PetscOptionsGetString(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps_filename", filename, sizeof(filename), NULL)); 1165 outputEps(*h2opus->hmatrix, filename); 1166 } 1167 PetscFunctionReturn(PETSC_SUCCESS); 1168 } 1169 1170 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernelFn *kernel, void *kernelctx) 1171 { 1172 Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data; 1173 PetscReal *gcoords; 1174 PetscInt N; 1175 MPI_Comm comm; 1176 PetscMPIInt size; 1177 PetscBool cong; 1178 1179 PetscFunctionBegin; 1180 PetscCall(PetscLayoutSetUp(A->rmap)); 1181 PetscCall(PetscLayoutSetUp(A->cmap)); 1182 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 1183 PetscCall(MatHasCongruentLayouts(A, &cong)); 1184 PetscCheck(cong, comm, PETSC_ERR_SUP, "Only for square matrices with congruent layouts"); 1185 N = A->rmap->N; 1186 PetscCallMPI(MPI_Comm_size(comm, &size)); 1187 if (spacedim > 0 && size > 1 && cdist) { 1188 PetscSF sf; 1189 MPI_Datatype dtype; 1190 1191 PetscCallMPI(MPI_Type_contiguous(spacedim, MPIU_REAL, &dtype)); 1192 PetscCallMPI(MPI_Type_commit(&dtype)); 1193 1194 PetscCall(PetscSFCreate(comm, &sf)); 1195 PetscCall(PetscSFSetGraphWithPattern(sf, A->rmap, PETSCSF_PATTERN_ALLGATHER)); 1196 PetscCall(PetscMalloc1(spacedim * N, &gcoords)); 1197 PetscCall(PetscSFBcastBegin(sf, dtype, coords, gcoords, MPI_REPLACE)); 1198 PetscCall(PetscSFBcastEnd(sf, dtype, coords, gcoords, MPI_REPLACE)); 1199 PetscCall(PetscSFDestroy(&sf)); 1200 PetscCallMPI(MPI_Type_free(&dtype)); 1201 } else gcoords = (PetscReal *)coords; 1202 1203 delete h2opus->ptcloud; 1204 delete h2opus->kernel; 1205 h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim, N, gcoords); 1206 if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel, spacedim, kernelctx); 1207 if (gcoords != coords) PetscCall(PetscFree(gcoords)); 1208 A->preallocated = PETSC_TRUE; 1209 PetscFunctionReturn(PETSC_SUCCESS); 1210 } 1211 1212 #if defined(PETSC_H2OPUS_USE_GPU) 1213 static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg) 1214 { 1215 PetscMPIInt size; 1216 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1217 1218 PetscFunctionBegin; 1219 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1220 if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) { 1221 if (size > 1) { 1222 PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1223 #if defined(H2OPUS_USE_MPI) 1224 
if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu); 1225 else *a->dist_hmatrix = *a->dist_hmatrix_gpu; 1226 #endif 1227 } else { 1228 PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1229 if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu); 1230 else *a->hmatrix = *a->hmatrix_gpu; 1231 } 1232 delete a->hmatrix_gpu; 1233 delete a->dist_hmatrix_gpu; 1234 a->hmatrix_gpu = NULL; 1235 a->dist_hmatrix_gpu = NULL; 1236 A->offloadmask = PETSC_OFFLOAD_CPU; 1237 } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) { 1238 if (size > 1) { 1239 PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1240 #if defined(H2OPUS_USE_MPI) 1241 if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix); 1242 else *a->dist_hmatrix_gpu = *a->dist_hmatrix; 1243 #endif 1244 } else { 1245 PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1246 if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix); 1247 else *a->hmatrix_gpu = *a->hmatrix; 1248 } 1249 delete a->hmatrix; 1250 delete a->dist_hmatrix; 1251 a->hmatrix = NULL; 1252 a->dist_hmatrix = NULL; 1253 A->offloadmask = PETSC_OFFLOAD_GPU; 1254 } 1255 PetscCall(PetscFree(A->defaultvectype)); 1256 if (!flg) { 1257 PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype)); 1258 } else { 1259 PetscCall(PetscStrallocpy(VECSTANDARD, &A->defaultvectype)); 1260 } 1261 A->boundtocpu = flg; 1262 PetscFunctionReturn(PETSC_SUCCESS); 1263 } 1264 #endif 1265 1266 /*MC 1267 MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package {cite}`zampinibouakaramturkiyyahkniokeyes2022`. 1268 1269 Options Database Key: 1270 . -mat_type h2opus - sets the matrix type to "h2opus" during a call to `MatSetFromOptions()` 1271 1272 Level: beginner 1273 1274 Notes: 1275 H2Opus implements hierarchical matrices in the $H^2$ flavour. It runs on CPUs and on NVIDIA GPUs. 1276 1277 For CPU-only builds, configure PETSc with `./configure --download-h2opus --download-thrust`. 1278 To run on NVIDIA GPUs, use `./configure --download-h2opus --download-magma --download-kblas`. 
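  Example Usage:
  The sketch below builds a `MATH2OPUS` matrix from a kernel and applies it to a vector; it is illustrative only. The Gaussian kernel and the variables `sdim`, `n` (number of local points), and `coords` (interleaved local point coordinates) are placeholders assumed to be provided by the caller, not part of this implementation.
.vb
  static PetscScalar GaussKernel(PetscInt sdim, PetscReal x[], PetscReal y[], void *ctx)
  {
    PetscReal r2 = 0.0;

    for (PetscInt d = 0; d < sdim; d++) r2 += (x[d] - y[d]) * (x[d] - y[d]);
    return PetscExpReal(-r2);
  }

  Mat A;
  Vec x, y;

  PetscCall(MatCreateH2OpusFromKernel(PETSC_COMM_WORLD, n, n, PETSC_DECIDE, PETSC_DECIDE, sdim, coords, PETSC_TRUE, GaussKernel, NULL, PETSC_DEFAULT, PETSC_DEFAULT, PETSC_DEFAULT, &A));
  PetscCall(MatSetFromOptions(A));                    // pick up -mat_h2opus_* options
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); // assembly performs the actual hierarchical construction
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatCreateVecs(A, &x, &y));
  PetscCall(VecSet(x, 1.0));
  PetscCall(MatMult(A, x, y));
.ve
  Passing `PETSC_DEFAULT` for the eta, leafsize, and basisord arguments keeps the default admissibility and approximation parameters.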
1279 1280 .seealso: [](ch_matrices), `Mat`, `MATH2OPUS`, `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()` 1281 M*/ 1282 PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A) 1283 { 1284 Mat_H2OPUS *a; 1285 PetscMPIInt size; 1286 1287 PetscFunctionBegin; 1288 #if defined(PETSC_H2OPUS_USE_GPU) 1289 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 1290 #endif 1291 PetscCall(PetscNew(&a)); 1292 A->data = (void *)a; 1293 1294 a->eta = 0.9; 1295 a->leafsize = 32; 1296 a->basisord = 4; 1297 a->max_rank = 64; 1298 a->bs = 32; 1299 a->rtol = 1.e-4; 1300 a->s = 1.0; 1301 a->norm_max_samples = 10; 1302 a->resize = PETSC_TRUE; /* reallocate after compression */ 1303 #if defined(H2OPUS_USE_MPI) 1304 h2opusCreateDistributedHandleComm(&a->handle, PetscObjectComm((PetscObject)A)); 1305 #else 1306 h2opusCreateHandle(&a->handle); 1307 #endif 1308 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1309 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATH2OPUS)); 1310 PetscCall(PetscMemzero(A->ops, sizeof(struct _MatOps))); 1311 1312 A->ops->destroy = MatDestroy_H2OPUS; 1313 A->ops->view = MatView_H2OPUS; 1314 A->ops->assemblyend = MatAssemblyEnd_H2OPUS; 1315 A->ops->mult = MatMult_H2OPUS; 1316 A->ops->multtranspose = MatMultTranspose_H2OPUS; 1317 A->ops->multadd = MatMultAdd_H2OPUS; 1318 A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS; 1319 A->ops->scale = MatScale_H2OPUS; 1320 A->ops->duplicate = MatDuplicate_H2OPUS; 1321 A->ops->setfromoptions = MatSetFromOptions_H2OPUS; 1322 A->ops->norm = MatNorm_H2OPUS; 1323 A->ops->zeroentries = MatZeroEntries_H2OPUS; 1324 #if defined(PETSC_H2OPUS_USE_GPU) 1325 A->ops->bindtocpu = MatBindToCPU_H2OPUS; 1326 #endif 1327 1328 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", MatProductSetFromOptions_H2OPUS)); 1329 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", MatProductSetFromOptions_H2OPUS)); 1330 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", MatProductSetFromOptions_H2OPUS)); 1331 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", MatProductSetFromOptions_H2OPUS)); 1332 #if defined(PETSC_H2OPUS_USE_GPU) 1333 PetscCall(PetscFree(A->defaultvectype)); 1334 PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype)); 1335 #endif 1336 PetscFunctionReturn(PETSC_SUCCESS); 1337 } 1338 1339 /*@ 1340 MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix. 1341 1342 Input Parameter: 1343 . 
A - the matrix 1344 1345 Level: intermediate 1346 1347 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()` 1348 @*/ 1349 PetscErrorCode MatH2OpusOrthogonalize(Mat A) 1350 { 1351 PetscBool ish2opus; 1352 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1353 PetscMPIInt size; 1354 PetscBool boundtocpu = PETSC_TRUE; 1355 1356 PetscFunctionBegin; 1357 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1358 PetscValidType(A, 1); 1359 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 1360 if (!ish2opus) PetscFunctionReturn(PETSC_SUCCESS); 1361 if (a->orthogonal) PetscFunctionReturn(PETSC_SUCCESS); 1362 HLibProfile::clear(); 1363 PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog, A, 0, 0, 0)); 1364 #if defined(PETSC_H2OPUS_USE_GPU) 1365 boundtocpu = A->boundtocpu; 1366 #endif 1367 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1368 if (size > 1) { 1369 if (boundtocpu) { 1370 PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1371 #if defined(H2OPUS_USE_MPI) 1372 distributed_horthog(*a->dist_hmatrix, a->handle); 1373 #endif 1374 #if defined(PETSC_H2OPUS_USE_GPU) 1375 A->offloadmask = PETSC_OFFLOAD_CPU; 1376 } else { 1377 PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1378 PetscCall(PetscLogGpuTimeBegin()); 1379 #if defined(H2OPUS_USE_MPI) 1380 distributed_horthog(*a->dist_hmatrix_gpu, a->handle); 1381 #endif 1382 PetscCall(PetscLogGpuTimeEnd()); 1383 #endif 1384 } 1385 } else { 1386 #if defined(H2OPUS_USE_MPI) 1387 h2opusHandle_t handle = a->handle->handle; 1388 #else 1389 h2opusHandle_t handle = a->handle; 1390 #endif 1391 if (boundtocpu) { 1392 PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1393 horthog(*a->hmatrix, handle); 1394 #if defined(PETSC_H2OPUS_USE_GPU) 1395 A->offloadmask = PETSC_OFFLOAD_CPU; 1396 } else { 1397 PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1398 PetscCall(PetscLogGpuTimeBegin()); 1399 horthog(*a->hmatrix_gpu, handle); 1400 PetscCall(PetscLogGpuTimeEnd()); 1401 #endif 1402 } 1403 } 1404 a->orthogonal = PETSC_TRUE; 1405 { /* log flops */ 1406 double gops, time, perf, dev; 1407 HLibProfile::getHorthogPerf(gops, time, perf, dev); 1408 #if defined(PETSC_H2OPUS_USE_GPU) 1409 if (boundtocpu) { 1410 PetscCall(PetscLogFlops(1e9 * gops)); 1411 } else { 1412 PetscCall(PetscLogGpuFlops(1e9 * gops)); 1413 } 1414 #else 1415 PetscCall(PetscLogFlops(1e9 * gops)); 1416 #endif 1417 } 1418 PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog, A, 0, 0, 0)); 1419 PetscFunctionReturn(PETSC_SUCCESS); 1420 } 1421 1422 /*@ 1423 MatH2OpusCompress - Compress a hierarchical matrix. 
1424 1425 Input Parameters: 1426 + A - the matrix 1427 - tol - the absolute truncation threshold 1428 1429 Level: intermediate 1430 1431 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()` 1432 @*/ 1433 PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol) 1434 { 1435 PetscBool ish2opus; 1436 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1437 PetscMPIInt size; 1438 PetscBool boundtocpu = PETSC_TRUE; 1439 1440 PetscFunctionBegin; 1441 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1442 PetscValidType(A, 1); 1443 PetscValidLogicalCollectiveReal(A, tol, 2); 1444 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 1445 if (!ish2opus || tol <= 0.0) PetscFunctionReturn(PETSC_SUCCESS); 1446 PetscCall(MatH2OpusOrthogonalize(A)); 1447 HLibProfile::clear(); 1448 PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress, A, 0, 0, 0)); 1449 #if defined(PETSC_H2OPUS_USE_GPU) 1450 boundtocpu = A->boundtocpu; 1451 #endif 1452 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1453 if (size > 1) { 1454 if (boundtocpu) { 1455 PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1456 #if defined(H2OPUS_USE_MPI) 1457 distributed_hcompress(*a->dist_hmatrix, tol, a->handle); 1458 if (a->resize) { 1459 DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix); 1460 delete a->dist_hmatrix; 1461 a->dist_hmatrix = dist_hmatrix; 1462 } 1463 #endif 1464 #if defined(PETSC_H2OPUS_USE_GPU) 1465 A->offloadmask = PETSC_OFFLOAD_CPU; 1466 } else { 1467 PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1468 PetscCall(PetscLogGpuTimeBegin()); 1469 #if defined(H2OPUS_USE_MPI) 1470 distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle); 1471 1472 if (a->resize) { 1473 DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu); 1474 delete a->dist_hmatrix_gpu; 1475 a->dist_hmatrix_gpu = dist_hmatrix_gpu; 1476 } 1477 #endif 1478 PetscCall(PetscLogGpuTimeEnd()); 1479 #endif 1480 } 1481 } else { 1482 #if defined(H2OPUS_USE_MPI) 1483 h2opusHandle_t handle = a->handle->handle; 1484 #else 1485 h2opusHandle_t handle = a->handle; 1486 #endif 1487 if (boundtocpu) { 1488 PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1489 hcompress(*a->hmatrix, tol, handle); 1490 1491 if (a->resize) { 1492 HMatrix *hmatrix = new HMatrix(*a->hmatrix); 1493 delete a->hmatrix; 1494 a->hmatrix = hmatrix; 1495 } 1496 #if defined(PETSC_H2OPUS_USE_GPU) 1497 A->offloadmask = PETSC_OFFLOAD_CPU; 1498 } else { 1499 PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1500 PetscCall(PetscLogGpuTimeBegin()); 1501 hcompress(*a->hmatrix_gpu, tol, handle); 1502 PetscCall(PetscLogGpuTimeEnd()); 1503 1504 if (a->resize) { 1505 HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu); 1506 delete a->hmatrix_gpu; 1507 a->hmatrix_gpu = hmatrix_gpu; 1508 } 1509 #endif 1510 } 1511 } 1512 { /* log flops */ 1513 double gops, time, perf, dev; 1514 HLibProfile::getHcompressPerf(gops, time, perf, dev); 1515 #if defined(PETSC_H2OPUS_USE_GPU) 1516 if (boundtocpu) { 1517 PetscCall(PetscLogFlops(1e9 * gops)); 1518 } else { 1519 PetscCall(PetscLogGpuFlops(1e9 * gops)); 1520 } 1521 #else 1522 PetscCall(PetscLogFlops(1e9 * gops)); 1523 #endif 1524 } 1525 PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress, A, 0, 0, 0)); 1526 
PetscFunctionReturn(PETSC_SUCCESS); 1527 } 1528 1529 /*@ 1530 MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix-vector products on another matrix to construct a hierarchical matrix. 1531 1532 Input Parameters: 1533 + A - the hierarchical matrix 1534 . B - the matrix to be sampled 1535 . bs - maximum number of samples to be taken concurrently 1536 - tol - relative tolerance for construction 1537 1538 Level: intermediate 1539 1540 Notes: 1541 You need to call `MatAssemblyBegin()` and `MatAssemblyEnd()` to update the hierarchical matrix. 1542 1543 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()` 1544 @*/ 1545 PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol) 1546 { 1547 PetscBool ish2opus; 1548 1549 PetscFunctionBegin; 1550 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1551 PetscValidType(A, 1); 1552 if (B) PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 1553 PetscValidLogicalCollectiveInt(A, bs, 3); 1554 PetscValidLogicalCollectiveReal(A, tol, 4); 1555 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 1556 if (ish2opus) { 1557 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1558 1559 if (!a->sampler) a->sampler = new PetscMatrixSampler(); 1560 a->sampler->SetSamplingMat(B); 1561 if (bs > 0) a->bs = bs; 1562 if (tol > 0.) a->rtol = tol; 1563 delete a->kernel; 1564 } 1565 PetscFunctionReturn(PETSC_SUCCESS); 1566 } 1567 1568 /*@C 1569 MatCreateH2OpusFromKernel - Creates a `MATH2OPUS` from a user-supplied kernel. 1570 1571 Input Parameters: 1572 + comm - MPI communicator 1573 . m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given) 1574 . n - number of local columns (or `PETSC_DECIDE` to have calculated if `N` is given) 1575 . M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given) 1576 . N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given) 1577 . spacedim - dimension of the space coordinates 1578 . coords - coordinates of the points 1579 . cdist - whether or not coordinates are distributed 1580 . kernel - computational kernel (or `NULL`) 1581 . kernelctx - kernel context 1582 . eta - admissibility condition tolerance 1583 . leafsize - leaf size in cluster tree 1584 - basisord - approximation order for Chebychev interpolation of low-rank blocks 1585 1586 Output Parameter: 1587 . nA - matrix 1588 1589 Options Database Keys: 1590 + -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree 1591 . -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance 1592 . 
-mat_h2opus_order <`PetscInt`> - Chebychev approximation order 1593 - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms 1594 1595 Level: intermediate 1596 1597 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()` 1598 @*/ 1599 PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernelFn *kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat *nA) 1600 { 1601 Mat A; 1602 Mat_H2OPUS *h2opus; 1603 #if defined(PETSC_H2OPUS_USE_GPU) 1604 PetscBool iscpu = PETSC_FALSE; 1605 #else 1606 PetscBool iscpu = PETSC_TRUE; 1607 #endif 1608 1609 PetscFunctionBegin; 1610 PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported"); 1611 PetscCall(MatCreate(comm, &A)); 1612 PetscCall(MatSetSizes(A, m, n, M, N)); 1613 PetscCheck(M == N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported"); 1614 PetscCall(MatSetType(A, MATH2OPUS)); 1615 PetscCall(MatBindToCPU(A, iscpu)); 1616 PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, kernel, kernelctx)); 1617 1618 h2opus = (Mat_H2OPUS *)A->data; 1619 if (eta > 0.) h2opus->eta = eta; 1620 if (leafsize > 0) h2opus->leafsize = leafsize; 1621 if (basisord > 0) h2opus->basisord = basisord; 1622 1623 *nA = A; 1624 PetscFunctionReturn(PETSC_SUCCESS); 1625 } 1626 1627 /*@ 1628 MatCreateH2OpusFromMat - Creates a `MATH2OPUS` sampling from a user-supplied operator. 1629 1630 Input Parameters: 1631 + B - the matrix to be sampled 1632 . spacedim - dimension of the space coordinates 1633 . coords - coordinates of the points 1634 . cdist - whether or not coordinates are distributed 1635 . eta - admissibility condition tolerance 1636 . leafsize - leaf size in cluster tree 1637 . maxrank - maximum rank allowed 1638 . bs - maximum number of samples to be taken concurrently 1639 - rtol - relative tolerance for construction 1640 1641 Output Parameter: 1642 . nA - matrix 1643 1644 Options Database Keys: 1645 + -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree 1646 . -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance 1647 . -mat_h2opus_maxrank <`PetscInt`> - Maximum rank when constructed from matvecs 1648 . -mat_h2opus_samples <`PetscInt`> - Maximum number of samples to be taken concurrently when constructing from matvecs 1649 . -mat_h2opus_rtol <`PetscReal`> - Relative tolerance for construction from sampling 1650 . -mat_h2opus_check <`PetscBool`> - Check error when constructing from sampling during MatAssemblyEnd() 1651 . 
-mat_h2opus_hara_verbose <`PetscBool`> - Verbose output from the HARA construction 1652 - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms 1653 1654 Level: intermediate 1655 1656 Note: 1657 Not available in parallel 1658 1659 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()` 1660 @*/ 1661 PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA) 1662 { 1663 Mat A; 1664 Mat_H2OPUS *h2opus; 1665 MPI_Comm comm; 1666 PetscBool boundtocpu = PETSC_TRUE; 1667 1668 PetscFunctionBegin; 1669 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 1670 PetscValidLogicalCollectiveInt(B, spacedim, 2); 1671 PetscValidLogicalCollectiveBool(B, cdist, 4); 1672 PetscValidLogicalCollectiveReal(B, eta, 5); 1673 PetscValidLogicalCollectiveInt(B, leafsize, 6); 1674 PetscValidLogicalCollectiveInt(B, maxrank, 7); 1675 PetscValidLogicalCollectiveInt(B, bs, 8); 1676 PetscValidLogicalCollectiveReal(B, rtol, 9); 1677 PetscAssertPointer(nA, 10); 1678 PetscCall(PetscObjectGetComm((PetscObject)B, &comm)); 1679 PetscCheck(B->rmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported"); 1680 PetscCheck(B->rmap->N == B->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported"); 1681 PetscCall(MatCreate(comm, &A)); 1682 PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N)); 1683 #if defined(PETSC_H2OPUS_USE_GPU) 1684 { 1685 VecType vtype; 1686 PetscBool isstd, iscuda, iskok; 1687 1688 PetscCall(MatGetVecType(B, &vtype)); 1689 PetscCall(PetscStrcmpAny(vtype, &isstd, VECSTANDARD, VECSEQ, VECMPI, "")); 1690 PetscCall(PetscStrcmpAny(vtype, &iscuda, VECCUDA, VECSEQCUDA, VECMPICUDA, "")); 1691 PetscCall(PetscStrcmpAny(vtype, &iskok, VECKOKKOS, VECSEQKOKKOS, VECMPIKOKKOS, "")); 1692 PetscCheck(isstd || iscuda || iskok, comm, PETSC_ERR_SUP, "Not for type %s", vtype); 1693 if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE; 1694 if (iskok && PetscDefined(HAVE_MACRO_KOKKOS_ENABLE_CUDA)) boundtocpu = PETSC_FALSE; 1695 } 1696 #endif 1697 PetscCall(MatSetType(A, MATH2OPUS)); 1698 PetscCall(MatBindToCPU(A, boundtocpu)); 1699 if (spacedim) PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, NULL, NULL)); 1700 PetscCall(MatPropagateSymmetryOptions(B, A)); 1701 /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */ 1702 1703 h2opus = (Mat_H2OPUS *)A->data; 1704 h2opus->sampler = new PetscMatrixSampler(B); 1705 if (eta > 0.) h2opus->eta = eta; 1706 if (leafsize > 0) h2opus->leafsize = leafsize; 1707 if (maxrank > 0) h2opus->max_rank = maxrank; 1708 if (bs > 0) h2opus->bs = bs; 1709 if (rtol > 0.) h2opus->rtol = rtol; 1710 *nA = A; 1711 A->preallocated = PETSC_TRUE; 1712 PetscFunctionReturn(PETSC_SUCCESS); 1713 } 1714 1715 /*@ 1716 MatH2OpusGetIndexMap - Access the reordering index set. 1717 1718 Input Parameter: 1719 . A - the matrix 1720 1721 Output Parameter: 1722 . 
indexmap - the index set for the reordering 1723 1724 Level: intermediate 1725 1726 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()` 1727 @*/ 1728 PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap) 1729 { 1730 PetscBool ish2opus; 1731 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1732 1733 PetscFunctionBegin; 1734 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1735 PetscValidType(A, 1); 1736 PetscAssertPointer(indexmap, 2); 1737 PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 1738 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 1739 PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name); 1740 *indexmap = a->h2opus_indexmap; 1741 PetscFunctionReturn(PETSC_SUCCESS); 1742 } 1743 1744 /*@ 1745 MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering 1746 1747 Input Parameters: 1748 + A - the matrix 1749 . nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map 1750 - in - the vector to be mapped 1751 1752 Output Parameter: 1753 . out - the newly created mapped vector 1754 1755 Level: intermediate 1756 1757 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()` 1758 @*/ 1759 PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec *out) 1760 { 1761 PetscBool ish2opus; 1762 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1763 PetscScalar *xin, *xout; 1764 PetscBool nm; 1765 1766 PetscFunctionBegin; 1767 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1768 PetscValidType(A, 1); 1769 PetscValidLogicalCollectiveBool(A, nativetopetsc, 2); 1770 PetscValidHeaderSpecific(in, VEC_CLASSID, 3); 1771 PetscAssertPointer(out, 4); 1772 PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 1773 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 1774 PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name); 1775 nm = a->nativemult; 1776 PetscCall(MatH2OpusSetNativeMult(A, (PetscBool)!nativetopetsc)); 1777 PetscCall(MatCreateVecs(A, out, NULL)); 1778 PetscCall(MatH2OpusSetNativeMult(A, nm)); 1779 if (!a->sf) { /* same ordering */ 1780 PetscCall(VecCopy(in, *out)); 1781 PetscFunctionReturn(PETSC_SUCCESS); 1782 } 1783 PetscCall(VecGetArrayRead(in, (const PetscScalar **)&xin)); 1784 PetscCall(VecGetArrayWrite(*out, &xout)); 1785 if (nativetopetsc) { 1786 PetscCall(PetscSFReduceBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE)); 1787 PetscCall(PetscSFReduceEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE)); 1788 } else { 1789 PetscCall(PetscSFBcastBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE)); 1790 PetscCall(PetscSFBcastEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE)); 1791 } 1792 PetscCall(VecRestoreArrayRead(in, (const PetscScalar **)&xin)); 1793 PetscCall(VecRestoreArrayWrite(*out, &xout)); 1794 PetscFunctionReturn(PETSC_SUCCESS); 1795 } 1796 1797 /*@ 1798 MatH2OpusLowRankUpdate - Perform a low-rank update of the form $ A = A + s * U * V^T $ 1799 1800 Input Parameters: 1801 + A - the hierarchical `MATH2OPUS` matrix 1802 . s - the scaling factor 1803 . 
U - the dense low-rank update matrix 1804 - V - (optional) the dense low-rank update matrix (if `NULL`, then `V` = `U` is assumed) 1805 1806 Note: 1807 The `U` and `V` matrices must be in `MATDENSE` format 1808 1809 Level: intermediate 1810 1811 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE` 1812 @*/ 1813 PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s) 1814 { 1815 PetscBool flg; 1816 1817 PetscFunctionBegin; 1818 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1819 PetscValidType(A, 1); 1820 PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 1821 PetscValidHeaderSpecific(U, MAT_CLASSID, 2); 1822 PetscCheckSameComm(A, 1, U, 2); 1823 if (V) { 1824 PetscValidHeaderSpecific(V, MAT_CLASSID, 3); 1825 PetscCheckSameComm(A, 1, V, 3); 1826 } 1827 PetscValidLogicalCollectiveScalar(A, s, 4); 1828 1829 if (!V) V = U; 1830 PetscCheck(U->cmap->N == V->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Non-matching rank update %" PetscInt_FMT " != %" PetscInt_FMT, U->cmap->N, V->cmap->N); 1831 if (!U->cmap->N) PetscFunctionReturn(PETSC_SUCCESS); 1832 PetscCall(PetscLayoutCompare(U->rmap, A->rmap, &flg)); 1833 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A and U must have the same row layout"); 1834 PetscCall(PetscLayoutCompare(V->rmap, A->cmap, &flg)); 1835 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A column layout must match V row layout"); 1836 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &flg)); 1837 if (flg) { 1838 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1839 const PetscScalar *u, *v, *uu, *vv; 1840 PetscInt ldu, ldv; 1841 PetscMPIInt size; 1842 #if defined(H2OPUS_USE_MPI) 1843 h2opusHandle_t handle = a->handle->handle; 1844 #else 1845 h2opusHandle_t handle = a->handle; 1846 #endif 1847 PetscBool usesf = (PetscBool)(a->sf && !a->nativemult); 1848 PetscSF usf, vsf; 1849 1850 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1851 PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet implemented in parallel"); 1852 PetscCall(PetscLogEventBegin(MAT_H2Opus_LR, A, 0, 0, 0)); 1853 PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U, &flg, MATSEQDENSE, MATMPIDENSE, "")); 1854 PetscCheck(flg, PetscObjectComm((PetscObject)U), PETSC_ERR_SUP, "Not for U of type %s", ((PetscObject)U)->type_name); 1855 PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V, &flg, MATSEQDENSE, MATMPIDENSE, "")); 1856 PetscCheck(flg, PetscObjectComm((PetscObject)V), PETSC_ERR_SUP, "Not for V of type %s", ((PetscObject)V)->type_name); 1857 PetscCall(MatDenseGetLDA(U, &ldu)); 1858 PetscCall(MatDenseGetLDA(V, &ldv)); 1859 PetscCall(MatBoundToCPU(A, &flg)); 1860 if (usesf) { 1861 PetscInt n; 1862 1863 PetscCall(MatDenseGetH2OpusStridedSF(U, a->sf, &usf)); 1864 PetscCall(MatDenseGetH2OpusStridedSF(V, a->sf, &vsf)); 1865 PetscCall(MatH2OpusResizeBuffers_Private(A, U->cmap->N, V->cmap->N)); 1866 PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL)); 1867 ldu = n; 1868 ldv = n; 1869 } 1870 if (flg) { 1871 PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1872 PetscCall(MatDenseGetArrayRead(U, &u)); 1873 PetscCall(MatDenseGetArrayRead(V, &v)); 1874 if (usesf) { 1875 vv = MatH2OpusGetThrustPointer(*a->yy); 1876 
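/* The star-forest broadcasts below permute the dense update factors from PETSc ordering into the H2OPUS cluster-tree ordering, staging V (and U further down, when U != V) in the preallocated host work buffers a->yy / a->xx */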
PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE)); 1877 PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE)); 1878 if (U != V) { 1879 uu = MatH2OpusGetThrustPointer(*a->xx); 1880 PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE)); 1881 PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE)); 1882 } else uu = vv; 1883 } else { 1884 uu = u; 1885 vv = v; 1886 } 1887 hlru_global(*a->hmatrix, uu, ldu, vv, ldv, U->cmap->N, s, handle); 1888 PetscCall(MatDenseRestoreArrayRead(U, &u)); 1889 PetscCall(MatDenseRestoreArrayRead(V, &v)); 1890 } else { 1891 #if defined(PETSC_H2OPUS_USE_GPU) 1892 PetscBool flgU, flgV; 1893 1894 PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1895 PetscCall(PetscObjectTypeCompareAny((PetscObject)U, &flgU, MATSEQDENSE, MATMPIDENSE, "")); 1896 if (flgU) PetscCall(MatConvert(U, MATDENSECUDA, MAT_INPLACE_MATRIX, &U)); 1897 PetscCall(PetscObjectTypeCompareAny((PetscObject)V, &flgV, MATSEQDENSE, MATMPIDENSE, "")); 1898 if (flgV) PetscCall(MatConvert(V, MATDENSECUDA, MAT_INPLACE_MATRIX, &V)); 1899 PetscCall(MatDenseCUDAGetArrayRead(U, &u)); 1900 PetscCall(MatDenseCUDAGetArrayRead(V, &v)); 1901 if (usesf) { 1902 vv = MatH2OpusGetThrustPointer(*a->yy_gpu); 1903 PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE)); 1904 PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE)); 1905 if (U != V) { 1906 uu = MatH2OpusGetThrustPointer(*a->xx_gpu); 1907 PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE)); 1908 PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE)); 1909 } else uu = vv; 1910 } else { 1911 uu = u; 1912 vv = v; 1913 } 1914 #else 1915 SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "This should not happen"); 1916 #endif 1917 hlru_global(*a->hmatrix_gpu, uu, ldu, vv, ldv, U->cmap->N, s, handle); 1918 #if defined(PETSC_H2OPUS_USE_GPU) 1919 PetscCall(MatDenseCUDARestoreArrayRead(U, &u)); 1920 PetscCall(MatDenseCUDARestoreArrayRead(V, &v)); 1921 if (flgU) PetscCall(MatConvert(U, MATDENSE, MAT_INPLACE_MATRIX, &U)); 1922 if (flgV) PetscCall(MatConvert(V, MATDENSE, MAT_INPLACE_MATRIX, &V)); 1923 #endif 1924 } 1925 PetscCall(PetscLogEventEnd(MAT_H2Opus_LR, A, 0, 0, 0)); 1926 a->orthogonal = PETSC_FALSE; 1927 } 1928 PetscFunctionReturn(PETSC_SUCCESS); 1929 } 1930 #endif 1931
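/*
   Illustrative usage sketch (not part of the library; the names n, N, sdim, coords, x, y and all
   numeric values are hypothetical): build a MATH2OPUS matrix from a user kernel, assemble it,
   apply it, and then orthogonalize/compress the hierarchical representation.

     static PetscScalar GaussianKernel(PetscInt sdim, PetscReal x[], PetscReal y[], void *ctx)
     {
       PetscReal r2 = 0.0;

       (void)ctx;
       for (PetscInt d = 0; d < sdim; d++) r2 += (x[d] - y[d]) * (x[d] - y[d]);
       return PetscExpReal(-r2);
     }

     // n local points, N global points, coords[] holds n*sdim local point coordinates
     Mat A;
     PetscCall(MatCreateH2OpusFromKernel(PETSC_COMM_WORLD, n, n, N, N, sdim, coords, PETSC_FALSE, GaussianKernel, NULL, 0.9, 32, 4, &A));
     PetscCall(MatSetFromOptions(A));
     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatMult(A, x, y));            // x, y obtained from MatCreateVecs(A, &x, &y)
     PetscCall(MatH2OpusOrthogonalize(A));   // orthogonalize the basis (also done internally by compression)
     PetscCall(MatH2OpusCompress(A, 1e-6));  // absolute truncation threshold
     PetscCall(MatDestroy(&A));
*/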