1 #include <h2opusconf.h> 2 /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */ 3 #if !defined(__CUDACC__) || defined(H2OPUS_USE_GPU) 4 #include <h2opus.h> 5 #if defined(H2OPUS_USE_MPI) 6 #include <h2opus/distributed/distributed_h2opus_handle.h> 7 #include <h2opus/distributed/distributed_geometric_construction.h> 8 #include <h2opus/distributed/distributed_hgemv.h> 9 #include <h2opus/distributed/distributed_horthog.h> 10 #include <h2opus/distributed/distributed_hcompress.h> 11 #endif 12 #include <h2opus/util/boxentrygen.h> 13 #include <petsc/private/matimpl.h> 14 #include <petsc/private/vecimpl.h> 15 #include <petsc/private/deviceimpl.h> 16 #include <petscsf.h> 17 18 /* math2opusutils */ 19 PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF, PetscInt, PetscInt, PetscInt, PetscSF *); 20 PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat, PetscSF, PetscSF *); 21 PETSC_INTERN PetscErrorCode VecSign(Vec, Vec); 22 PETSC_INTERN PetscErrorCode VecSetDelta(Vec, PetscInt); 23 PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat, NormType, PetscInt, PetscReal *); 24 25 #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data()) 26 27 /* Use GPU only if H2OPUS is configured for GPU */ 28 #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU) 29 #define PETSC_H2OPUS_USE_GPU 30 #endif 31 #if defined(PETSC_H2OPUS_USE_GPU) 32 #define MatH2OpusUpdateIfNeeded(A, B) MatBindToCPU(A, (PetscBool)((A)->boundtocpu || (B))) 33 #else 34 #define MatH2OpusUpdateIfNeeded(A, B) PETSC_SUCCESS 35 #endif 36 37 // TODO H2OPUS: 38 // DistributedHMatrix 39 // unsymmetric ? 40 // transpose for distributed_hgemv? 41 // clearData() 42 // Unify interface for sequential and parallel? 
// Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
//

/* Host-side point cloud handed to H2OPUS for the geometric cluster-tree construction.
   Points are stored interleaved: pts[n * dimension + i] is coordinate i of point n. */
template <class T>
class PetscPointCloud : public H2OpusDataSet<T> {
private:
  int            dimension;  /* spatial dimension (always >= 1) */
  size_t         num_points; /* number of points stored */
  std::vector<T> pts;        /* interleaved coordinates, size num_points * dimension */

public:
  /* Build from user coordinates; if coords is NULL, generate a default cloud with
     unit spacing along the first coordinate and zeros elsewhere */
  PetscPointCloud(int dim, size_t num_pts, const T coords[])
  {
    dim = dim > 0 ? dim : 1; /* guard against non-positive dimension */
    this->dimension  = dim;
    this->num_points = num_pts;

    pts.resize(num_pts * dim);
    if (coords) {
      for (size_t n = 0; n < num_pts; n++)
        for (int i = 0; i < dim; i++) pts[n * dim + i] = coords[n * dim + i];
    } else {
      PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0;
      for (size_t n = 0; n < num_pts; n++) {
        pts[n * dim] = n * h;
        for (int i = 1; i < dim; i++) pts[n * dim + i] = 0.0;
      }
    }
  }

  /* Deep-copy constructor */
  PetscPointCloud(const PetscPointCloud<T> &other)
  {
    size_t N = other.dimension * other.num_points;
    this->dimension  = other.dimension;
    this->num_points = other.num_points;
    this->pts.resize(N);
    for (size_t i = 0; i < N; i++) this->pts[i] = other.pts[i];
  }

  int getDimension() const { return dimension; }

  size_t getDataSetSize() const { return num_points; }

  /* Coordinate 'dim' of point 'idx'; asserts bounds in debug builds */
  T getDataPoint(size_t idx, int dim) const
  {
    assert(dim < dimension && idx < num_points);
    return pts[idx * dimension + dim];
  }

  /* Debugging helper: dump the cloud, one point per line */
  void Print(std::ostream &out = std::cout)
  {
    out << "Dimension: " << dimension << std::endl;
    out << "NumPoints: " << num_points << std::endl;
    for (size_t n = 0; n < num_points; n++) {
      for (int d = 0; d < dimension; d++) out << pts[n * dimension + d] << " ";
      out << std::endl;
    }
  }
};

/* Adapter exposing a user MatH2OpusKernel callback as the entry-generation functor
   (operator()(pt1,pt2)) that the H2OPUS box entry generator expects */
template <class T>
class PetscFunctionGenerator {
private:
  MatH2OpusKernel k;   /* user kernel function pointer */
  int             dim; /* spatial dimension passed back to the kernel */
  void           *ctx; /* user context passed back to the kernel */

public:
  PetscFunctionGenerator(MatH2OpusKernel k, int dim, void *ctx)
  {
    this->k   = k;
    this->dim = dim;
    this->ctx = ctx;
  }
  PetscFunctionGenerator(PetscFunctionGenerator &other)
  {
    this->k   = other.k;
    this->dim = other.dim;
    this->ctx = other.ctx;
  }
  /* Evaluate the kernel at the pair of points (matrix entry value) */
  T operator()(PetscReal *pt1, PetscReal *pt2) { return (T)((*this->k)(this->dim, pt1, pt2, this->ctx)); }
};

#include <../src/mat/impls/h2opus/math2opussampler.hpp>

/* just to not clutter the code: alias the GPU types to the CPU ones on CPU-only builds */
#if !defined(H2OPUS_USE_GPU)
typedef HMatrix HMatrix_GPU;
#if defined(H2OPUS_USE_MPI)
typedef DistributedHMatrix DistributedHMatrix_GPU;
#endif
#endif

/* Private data of a MATH2OPUS matrix */
typedef struct {
#if defined(H2OPUS_USE_MPI)
  distributedH2OpusHandle_t handle;
#else
  h2opusHandle_t handle;
#endif
  /* Sequential and parallel matrices are two different classes at the moment */
  HMatrix *hmatrix; /* sequential (single-rank) matrix */
#if defined(H2OPUS_USE_MPI)
  DistributedHMatrix *dist_hmatrix; /* parallel matrix */
#else
  HMatrix *dist_hmatrix; /* just to not clutter the code */
#endif
  /* May use permutations */
  PetscSF                           sf;                       /* maps PETSc ordering to H2OPUS ordering */
  PetscLayout                       h2opus_rmap, h2opus_cmap; /* layouts in H2OPUS ordering (swapped in by MatH2OpusSetNativeMult) */
  IS                                h2opus_indexmap;          /* index permutation used to build the SF */
  thrust::host_vector<PetscScalar> *xx, *yy;                  /* host scratch buffers for permuted mult input/output */
  PetscInt                          xxs, yys;                 /* number of columns the scratch buffers currently hold */
  PetscBool                         multsetup;                /* MatSetUpMultiply_H2OPUS already done? */

  /* GPU */
  HMatrix_GPU *hmatrix_gpu;
#if defined(H2OPUS_USE_MPI)
  DistributedHMatrix_GPU *dist_hmatrix_gpu;
#else
  HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */
#endif
#if defined(PETSC_H2OPUS_USE_GPU)
  thrust::device_vector<PetscScalar> *xx_gpu, *yy_gpu; /* device scratch buffers, mirroring xx/yy */
  PetscInt                            xxs_gpu, yys_gpu;
#endif

  /* construction from matvecs */
  PetscMatrixSampler *sampler;
  PetscBool           nativemult; /* if true, vectors are in H2OPUS ordering: skip the SF permutation */

  /* Admissibility */
  PetscReal eta;
  PetscInt  leafsize;

  /* for dof reordering */
  PetscPointCloud<PetscReal> *ptcloud;

  /* kernel for generating matrix entries */
  PetscFunctionGenerator<PetscScalar> *kernel;

  /* basis orthogonalized? */
  PetscBool orthogonal;

  /* customization */
  PetscInt  basisord;
  PetscInt  max_rank;
  PetscInt  bs;
  PetscReal rtol;
  PetscInt  norm_max_samples;
  PetscBool check_construction;
  PetscBool hara_verbose;
  PetscBool resize;

  /* keeps track of MatScale values */
  PetscScalar s;
} Mat_H2OPUS;

/* Destroy all CPU/GPU matrices, scratch buffers, maps and composed functions of a MATH2OPUS matrix */
static PetscErrorCode MatDestroy_H2OPUS(Mat A)
{
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

  PetscFunctionBegin;
#if defined(H2OPUS_USE_MPI)
  h2opusDestroyDistributedHandle(a->handle);
#else
  h2opusDestroyHandle(a->handle);
#endif
  delete a->dist_hmatrix;
  delete a->hmatrix;
  PetscCall(PetscSFDestroy(&a->sf));
  PetscCall(PetscLayoutDestroy(&a->h2opus_rmap));
  PetscCall(PetscLayoutDestroy(&a->h2opus_cmap));
  PetscCall(ISDestroy(&a->h2opus_indexmap));
  delete a->xx;
  delete a->yy;
  delete a->hmatrix_gpu;
  delete a->dist_hmatrix_gpu;
#if defined(PETSC_H2OPUS_USE_GPU)
  delete a->xx_gpu;
  delete a->yy_gpu;
#endif
  delete a->sampler;
  delete a->ptcloud;
  delete a->kernel;
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", NULL));
  PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
  PetscCall(PetscFree(A->data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Toggle "native" multiply mode: when enabled, MatMult expects/produces vectors already in
   the H2OPUS ordering/distribution, so the matrix layouts are swapped with the H2OPUS ones */
PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
{
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
  PetscBool   ish2opus;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidLogicalCollectiveBool(A, nm, 2);
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
  if (ish2opus) {
    if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
      if ((!a->nativemult && nm) || (a->nativemult && !nm)) { /* only swap when the mode actually changes */
        PetscLayout t;
        t              = A->rmap;
        A->rmap        = a->h2opus_rmap;
        a->h2opus_rmap = t;
        t              = A->cmap;
        A->cmap        = a->h2opus_cmap;
        a->h2opus_cmap = t;
      }
    }
    a->nativemult = nm;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Query whether the MATH2OPUS matrix is in native-multiply mode; errors on other types */
PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
{
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
  PetscBool   ish2opus;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(nm, 2);
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
  PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
  *nm = a->nativemult;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Sampling-based norm estimation for MATH2OPUS (also usable on other types via MatShell).
   Temporarily enables native multiply as a fast path since the estimate is order independent. */
PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal *n)
{
  PetscBool   ish2opus;
  PetscInt    nmax = PETSC_DECIDE;
  Mat_H2OPUS *a    = NULL;
  PetscBool   mult = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
  if (ish2opus) { /* set userdefine number of samples and fastpath for mult (norms are order independent) */
    a = (Mat_H2OPUS *)A->data;

    nmax = a->norm_max_samples;
    mult = a->nativemult; /* remember current mode to restore it afterwards */
    PetscCall(MatH2OpusSetNativeMult(A, PETSC_TRUE));
  } else {
    PetscCall(PetscOptionsGetInt(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_approximate_norm_samples", &nmax, NULL));
  }
  PetscCall(MatApproximateNorm_Private(A, normtype, nmax, n));
  if (a) PetscCall(MatH2OpusSetNativeMult(A, mult));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Grow the host (or device, when offloaded) scratch buffers so they can hold xN input
   and yN output columns of length n (the local size of the H2OPUS ordering) */
static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
{
  Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
  PetscInt    n;
  PetscBool   boundtocpu = PETSC_TRUE;

  PetscFunctionBegin;
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
  if (boundtocpu) {
    if (h2opus->xxs < xN) { /* only grow, never shrink */
      h2opus->xx->resize(n * xN);
      h2opus->xxs = xN;
    }
    if (h2opus->yys < yN) {
      h2opus->yy->resize(n * yN);
      h2opus->yys = yN;
    }
  }
#if defined(PETSC_H2OPUS_USE_GPU)
  if (!boundtocpu) {
    if (h2opus->xxs_gpu < xN) {
      h2opus->xx_gpu->resize(n * xN);
      h2opus->xxs_gpu = xN;
    }
    if (h2opus->yys_gpu < yN) {
      h2opus->yy_gpu->resize(n * yN);
      h2opus->yys_gpu = yN;
    }
  }
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Computes C = s * op(A) * B for dense B and C (op = transpose when transA).
   Handles the optional PETSc->H2OPUS permutation via star forests and CPU/GPU execution. */
static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
{
  Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
#if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = h2opus->handle->handle;
#else
  h2opusHandle_t handle = h2opus->handle;
#endif
  PetscBool    boundtocpu = PETSC_TRUE;
  PetscScalar *xx, *yy, *uxx, *uyy;
  PetscInt     blda, clda;
  PetscMPIInt  size;
  PetscSF      bsf, csf;
  PetscBool    usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult); /* permute unless running in native mode */

  PetscFunctionBegin;
  HLibProfile::clear(); /* reset H2OPUS profiling so the flops logged below belong to this call */
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  PetscCall(MatDenseGetLDA(B, &blda));
  PetscCall(MatDenseGetLDA(C, &clda));
  if (usesf) {
    PetscInt n;

    PetscCall(MatDenseGetH2OpusVectorSF(B, h2opus->sf, &bsf));
    PetscCall(MatDenseGetH2OpusVectorSF(C, h2opus->sf, &csf));

    PetscCall(MatH2OpusResizeBuffers_Private(A, B->cmap->N, C->cmap->N));
    PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
    /* permuted data lives in the contiguous scratch buffers, leading dimension n */
    blda = n;
    clda = n;
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (boundtocpu) {
    PetscCall(MatDenseGetArrayRead(B, (const PetscScalar **)&xx));
    PetscCall(MatDenseGetArrayWrite(C, &yy));
    if (usesf) {
      /* scatter B into the H2OPUS ordering (scratch buffers) before the multiply */
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
      PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
    } else {
      uxx = xx;
      uyy = yy;
    }
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
      PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
    }
    PetscCall(MatDenseRestoreArrayRead(B, (const PetscScalar **)&xx));
    if (usesf) {
      /* gather the result back into PETSc ordering */
      PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
    }
    PetscCall(MatDenseRestoreArrayWrite(C, &yy));
#if defined(PETSC_H2OPUS_USE_GPU)
  } else {
    PetscBool ciscuda, biscuda;

    /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
    PetscCall(PetscObjectTypeCompareAny((PetscObject)B, &biscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
    if (!biscuda) PetscCall(MatConvert(B, MATDENSECUDA, MAT_INPLACE_MATRIX, &B));
    PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &ciscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
    if (!ciscuda) {
      C->assembled = PETSC_TRUE; /* needed so the in-place conversion accepts C */
      PetscCall(MatConvert(C, MATDENSECUDA, MAT_INPLACE_MATRIX, &C));
    }
    PetscCall(MatDenseCUDAGetArrayRead(B, (const PetscScalar **)&xx));
    PetscCall(MatDenseCUDAGetArrayWrite(C, &yy));
    if (usesf) {
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
      PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
    } else {
      uxx = xx;
      uyy = yy;
    }
    PetscCall(PetscLogGpuTimeBegin());
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
      PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatDenseCUDARestoreArrayRead(B, (const PetscScalar **)&xx));
    if (usesf) {
      PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
    }
    PetscCall(MatDenseCUDARestoreArrayWrite(C, &yy));
    /* undo the on-the-fly conversions so B and C leave with their original types */
    if (!biscuda) PetscCall(MatConvert(B, MATDENSE, MAT_INPLACE_MATRIX, &B));
    if (!ciscuda) PetscCall(MatConvert(C, MATDENSE, MAT_INPLACE_MATRIX, &C));
#endif
  }
  { /* log flops */
    double gops, time, perf, dev;
    HLibProfile::getHgemvPerf(gops, time, perf, dev); /* gops is reported in GFlops by H2OPUS */
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9 * gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9 * gops));
    }
#else
    PetscCall(PetscLogFlops(1e9 * gops));
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatProduct numeric phase: dispatch C = A*B or C = A^T*B to the dense multiply kernel */
static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
{
  Mat_Product *product = C->product;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  switch (product->type) {
  case MATPRODUCT_AB:
    PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_FALSE, product->B, C));
    break;
  case MATPRODUCT_AtB:
    PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_TRUE, product->B, C));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatProduct symbolic phase: size C, make it dense (matching B's type) and set the numeric callback */
static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
{
  Mat_Product *product = C->product;
  PetscBool    cisdense;
  Mat          A, B;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  A = product->A;
  B = product->B;
  switch (product->type) {
  case MATPRODUCT_AB:
    PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
    PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
    PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
    if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name)); /* inherit density/location from B */
    PetscCall(MatSetUp(C));
    break;
  case MATPRODUCT_AtB:
    /* A^T*B: row space of C comes from A's columns */
    PetscCall(MatSetSizes(C, A->cmap->n, B->cmap->n, A->cmap->N, B->cmap->N));
    PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
    PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
    if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
    PetscCall(MatSetUp(C));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
  }
  C->ops->productsymbolic = NULL;
  C->ops->productnumeric  = MatProductNumeric_H2OPUS;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Register the symbolic callback for the supported product types (AB, AtB) */
static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
{
  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Computes y = s * op(A) * x + sy * y (op = transpose when trans).
   Single-vector analogue of MatMultNKernel_H2OPUS; sy != 0 additionally requires
   scattering the current y into the H2OPUS ordering before the multiply. */
static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
{
  Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
#if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = h2opus->handle->handle;
#else
  h2opusHandle_t handle = h2opus->handle;
#endif
  PetscBool    boundtocpu = PETSC_TRUE;
  PetscInt     n;
  PetscScalar *xx, *yy, *uxx, *uyy;
  PetscMPIInt  size;
  PetscBool    usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

  PetscFunctionBegin;
  HLibProfile::clear();
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  if (usesf) {
    PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
  } else n = A->rmap->n;
  if (boundtocpu) {
    PetscCall(VecGetArrayRead(x, (const PetscScalar **)&xx));
    if (sy == 0.0) {
      PetscCall(VecGetArrayWrite(y, &yy)); /* y is overwritten: no need to read its old values */
    } else {
      PetscCall(VecGetArray(y, &yy));
    }
    if (usesf) {
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy);

      PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      if (sy != 0.0) { /* bring the current y along so sy*y is accumulated in H2OPUS ordering */
        PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
      }
    } else {
      uxx = xx;
      uyy = yy;
    }
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
      PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
    }
    PetscCall(VecRestoreArrayRead(x, (const PetscScalar **)&xx));
    if (usesf) {
      PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
    }
    if (sy == 0.0) {
      PetscCall(VecRestoreArrayWrite(y, &yy));
    } else {
      PetscCall(VecRestoreArray(y, &yy));
    }
#if defined(PETSC_H2OPUS_USE_GPU)
  } else {
    PetscCall(VecCUDAGetArrayRead(x, (const PetscScalar **)&xx));
    if (sy == 0.0) {
      PetscCall(VecCUDAGetArrayWrite(y, &yy));
    } else {
      PetscCall(VecCUDAGetArray(y, &yy));
    }
    if (usesf) {
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);

      PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      if (sy != 0.0) {
        PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
      }
    } else {
      uxx = xx;
      uyy = yy;
    }
    PetscCall(PetscLogGpuTimeBegin());
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
      PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArrayRead(x, (const PetscScalar **)&xx));
    if (usesf) {
      PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
    }
    if (sy == 0.0) {
      PetscCall(VecCUDARestoreArrayWrite(y, &yy));
    } else {
      PetscCall(VecCUDARestoreArray(y, &yy));
    }
#endif
  }
  { /* log flops */
    double gops, time, perf, dev;
    HLibProfile::getHgemvPerf(gops, time, perf, dev);
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9 * gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9 * gops));
    }
#else
    PetscCall(PetscLogFlops(1e9 * gops));
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A^T * x; moves the matrix to/from the GPU as dictated by the vector types */
static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
{
  PetscBool xiscuda, yiscuda;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
  PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
  PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda)); /* bind to CPU unless both vectors are CUDA */
  PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A * x; moves the matrix to/from the GPU as dictated by the vector types */
static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
{
  PetscBool xiscuda, yiscuda;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
  PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
  PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
  PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* z = y + A^T * x, implemented as z = y followed by an accumulating multiply */
static PetscErrorCode
MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
{
  PetscBool xiscuda, ziscuda;

  PetscFunctionBegin;
  PetscCall(VecCopy(y, z)); /* kernel accumulates into z with sy = 1.0 */
  PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
  PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
  PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
  PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* z = y + A * x, implemented as z = y followed by an accumulating multiply */
static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
{
  PetscBool xiscuda, ziscuda;

  PetscFunctionBegin;
  PetscCall(VecCopy(y, z));
  PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
  PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
  PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
  PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Lazy scaling: fold s into the stored scalar applied during the multiplies */
static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
{
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

  PetscFunctionBegin;
  a->s *= s;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Process -mat_h2opus_* command line options */
static PetscErrorCode MatSetFromOptions_H2OPUS(Mat A, PetscOptionItems *PetscOptionsObject)
{
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "H2OPUS options");
  PetscCall(PetscOptionsInt("-mat_h2opus_leafsize", "Leaf size of cluster tree", NULL, a->leafsize, &a->leafsize, NULL));
  PetscCall(PetscOptionsReal("-mat_h2opus_eta", "Admissibility condition tolerance", NULL, a->eta, &a->eta, NULL));
  PetscCall(PetscOptionsInt("-mat_h2opus_order", "Basis order for off-diagonal sampling when constructed from kernel", NULL, a->basisord, &a->basisord, NULL));
  PetscCall(PetscOptionsInt("-mat_h2opus_maxrank", "Maximum rank when constructed from matvecs", NULL, a->max_rank, &a->max_rank, NULL));
  PetscCall(PetscOptionsInt("-mat_h2opus_samples", "Maximum number of samples to be taken concurrently when constructing from matvecs", NULL, a->bs, &a->bs, NULL));
  PetscCall(PetscOptionsInt("-mat_h2opus_normsamples", "Maximum number of samples to be when estimating norms", NULL, a->norm_max_samples, &a->norm_max_samples, NULL));
  PetscCall(PetscOptionsReal("-mat_h2opus_rtol", "Relative tolerance for construction from sampling", NULL, a->rtol, &a->rtol, NULL));
  PetscCall(PetscOptionsBool("-mat_h2opus_check", "Check error when constructing from sampling during MatAssemblyEnd()", NULL, a->check_construction, &a->check_construction, NULL));
  PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose", "Verbose output from hara construction", NULL, a->hara_verbose, &a->hara_verbose, NULL));
  PetscCall(PetscOptionsBool("-mat_h2opus_resize", "Resize after compression", NULL, a->resize, &a->resize, NULL));
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat, PetscInt, const PetscReal[], PetscBool, MatH2OpusKernel, void *);

/* Build the point cloud if not already present: use coordinates composed on the matrix
   (or on the sampling matrix), falling back to a default (coordinate-less) cloud */
static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
{
  Mat_H2OPUS        *a = (Mat_H2OPUS *)A->data;
  Vec                c;
  PetscInt           spacedim;
  const PetscScalar *coords;

  PetscFunctionBegin;
  if (a->ptcloud) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscObjectQuery((PetscObject)A, "__math2opus_coords", (PetscObject *)&c));
  if (!c && a->sampler) {
    Mat S = a->sampler->GetSamplingMat();

    PetscCall(PetscObjectQuery((PetscObject)S, "__math2opus_coords", (PetscObject *)&c));
  }
  if (!c) {
    /* no coordinates available: spacedim -1 triggers the default point cloud */
    PetscCall(MatH2OpusSetCoords_H2OPUS(A, -1, NULL, PETSC_FALSE, NULL, NULL));
  } else {
    PetscCall(VecGetArrayRead(c, &coords));
    PetscCall(VecGetBlockSize(c, &spacedim)); /* block size encodes the spatial dimension */
    PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, PETSC_FALSE, NULL, NULL));
    PetscCall(VecRestoreArrayRead(c, &coords));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* One-time setup of the permutation SF and scratch buffers used by the multiply kernels.
   The permutation comes from the H2OPUS basis-tree index map. */
static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
{
  MPI_Comm      comm;
  PetscMPIInt   size;
  Mat_H2OPUS   *a   = (Mat_H2OPUS *)A->data;
  PetscInt      n   = 0, *idx = NULL;
  int          *iidx = NULL;
  PetscCopyMode own;
  PetscBool     rid;

  PetscFunctionBegin;
  if (a->multsetup) PetscFunctionReturn(PETSC_SUCCESS);
  if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */
    PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
#if defined(PETSC_H2OPUS_USE_GPU)
    a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
    a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
    a->xxs_gpu = 1;
    a->yys_gpu = 1;
#endif
    a->xx  = new thrust::host_vector<PetscScalar>(n);
    a->yy  = new thrust::host_vector<PetscScalar>(n);
    a->xxs = 1;
    a->yys = 1;
  } else {
    IS is;
    PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
    PetscCallMPI(MPI_Comm_size(comm, &size));
    if (!a->h2opus_indexmap) {
      if (size > 1) {
        PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
#if defined(H2OPUS_USE_MPI)
        iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
        n    = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
#endif
      } else {
        iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
        n    = a->hmatrix->u_basis_tree.index_map.size();
      }

      if (PetscDefined(USE_64BIT_INDICES)) {
        PetscInt i;

        /* H2OPUS uses 32-bit int indices: widen into a freshly allocated PetscInt array */
        own = PETSC_OWN_POINTER;
        PetscCall(PetscMalloc1(n, &idx));
        for (i = 0; i < n; i++) idx[i] = iidx[i];
      } else {
        own = PETSC_COPY_VALUES;
        idx = (PetscInt *)iidx;
      }
      PetscCall(ISCreateGeneral(comm, n, idx, own, &is));
      PetscCall(ISSetPermutation(is));
      PetscCall(ISViewFromOptions(is, (PetscObject)A, "-mat_h2opus_indexmap_view"));
      a->h2opus_indexmap
      = is;
    }
    PetscCall(ISGetLocalSize(a->h2opus_indexmap, &n));
    PetscCall(ISGetIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
    rid = (PetscBool)(n == A->rmap->n);
    PetscCall(MPIU_Allreduce(MPI_IN_PLACE, &rid, 1, MPIU_BOOL, MPI_LAND, comm));
    if (rid) PetscCall(ISIdentity(a->h2opus_indexmap, &rid));
    if (!rid) { /* identity permutation needs no SF and no scratch buffers */
      if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
        PetscCall(PetscLayoutCreate(comm, &a->h2opus_rmap));
        PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap, n));
        PetscCall(PetscLayoutSetUp(a->h2opus_rmap));
        PetscCall(PetscLayoutReference(a->h2opus_rmap, &a->h2opus_cmap));
      }
      PetscCall(PetscSFCreate(comm, &a->sf));
      PetscCall(PetscSFSetGraphLayout(a->sf, A->rmap, n, NULL, PETSC_OWN_POINTER, idx));
      PetscCall(PetscSFSetUp(a->sf));
      PetscCall(PetscSFViewFromOptions(a->sf, (PetscObject)A, "-mat_h2opus_sf_view"));
#if defined(PETSC_H2OPUS_USE_GPU)
      a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
      a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
      a->xxs_gpu = 1;
      a->yys_gpu = 1;
#endif
      a->xx  = new thrust::host_vector<PetscScalar>(n);
      a->yy  = new thrust::host_vector<PetscScalar>(n);
      a->xxs = 1;
      a->yys = 1;
    }
    PetscCall(ISRestoreIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
  }
  a->multsetup = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Assembly: (re)build the hierarchical matrix, either from the user kernel (geometric
   construction) or from matvecs of the sampling matrix (hara), then set up the multiply */
static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
{
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
#if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = a->handle->handle;
#else
  h2opusHandle_t handle = a->handle;
#endif
  PetscBool   kernel       = PETSC_FALSE;
  PetscBool   boundtocpu   = PETSC_TRUE;
  PetscBool   samplingdone = PETSC_FALSE;
  MPI_Comm    comm;
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
  PetscCheck(A->rmap->N == A->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");

  /* XXX */
  a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));

  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* TODO REUSABILITY of geometric construction */
  delete a->hmatrix;
  delete a->dist_hmatrix;
#if defined(PETSC_H2OPUS_USE_GPU)
  delete a->hmatrix_gpu;
  delete a->dist_hmatrix_gpu;
#endif
  a->orthogonal = PETSC_FALSE;

  /* TODO: other? */
  H2OpusBoxCenterAdmissibility adm(a->eta);

  PetscCall(PetscLogEventBegin(MAT_H2Opus_Build, A, 0, 0, 0));
  if (size > 1) {
#if defined(H2OPUS_USE_MPI)
    a->dist_hmatrix = new DistributedHMatrix(A->rmap->n /* ,A->symmetric */);
#else
    a->dist_hmatrix = NULL;
#endif
  } else a->hmatrix = new HMatrix(A->rmap->n, A->symmetric == PETSC_BOOL3_TRUE);
  PetscCall(MatH2OpusInferCoordinates_Private(A));
  PetscCheck(a->ptcloud, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing pointcloud");
  if (a->kernel) {
    /* geometric construction: entries generated on the fly from the user kernel */
    BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
    if (size > 1) {
      PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
#if defined(H2OPUS_USE_MPI)
      buildDistributedHMatrix(*a->dist_hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord, a->handle);
#endif
    } else {
      buildHMatrix(*a->hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord);
    }
    kernel = PETSC_TRUE;
  } else {
    PetscCheck(size <= 1, comm, PETSC_ERR_SUP, "Construction from sampling not supported in parallel");
    /* only the tree structure here; values are filled by hara() below */
    buildHMatrixStructure(*a->hmatrix, a->ptcloud, a->leafsize, adm);
  }
  PetscCall(MatSetUpMultiply_H2OPUS(A));

#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
  if (!boundtocpu) { /* mirror the freshly built CPU matrix on the GPU */
    if (size > 1) {
      PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
#if defined(H2OPUS_USE_MPI)
      a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
#endif
    } else {
      a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
    }
  }
#endif
  if (size == 1) {
    if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
      /* construction from matvecs: run adaptive randomized approximation (hara) */
      PetscReal Anorm;
      bool      verbose;

      PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_hara_verbose", &a->hara_verbose, NULL));
      verbose = a->hara_verbose;
      PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(), NORM_2, a->norm_max_samples, &Anorm));
      if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF, "Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n", a->max_rank, a->rtol * Anorm, a->rtol, Anorm, boundtocpu ? "CPU" : "GPU", a->bs));
      if (a->sf && !a->nativemult) a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(), a->hmatrix->u_basis_tree.index_map.data());
      a->sampler->SetStream(handle->getMainStream());
      if (boundtocpu) {
        a->sampler->SetGPUSampling(false);
        hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
#if defined(PETSC_H2OPUS_USE_GPU)
      } else {
        a->sampler->SetGPUSampling(true);
        hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
#endif
      }
      samplingdone = PETSC_TRUE;
    }
  }
#if defined(PETSC_H2OPUS_USE_GPU)
  if (!boundtocpu) { /* data now lives on the GPU only */
    delete a->hmatrix;
    delete a->dist_hmatrix;
    a->hmatrix      = NULL;
    a->dist_hmatrix = NULL;
  }
  A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
#endif
  PetscCall(PetscLogEventEnd(MAT_H2Opus_Build, A, 0, 0, 0));

  if (!a->s) a->s = 1.0;
  A->assembled = PETSC_TRUE;

  if (samplingdone) {
    /* optionally verify the sampled construction against the sampling matrix */
    PetscBool check  = a->check_construction;
    PetscBool checke = PETSC_FALSE;

    PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check", &check, NULL));
    PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check_explicit", &checke, NULL));
    if (check) {
      Mat       E, Ae;
      PetscReal n1, ni, n2;
      PetscReal n1A, niA, n2A;
      void (*normfunc)(void);

      Ae = a->sampler->GetSamplingMat();
      /* E = A - Ae as a shell, with norms estimated by sampling */
      PetscCall(MatConvert(A, MATSHELL, MAT_INITIAL_MATRIX, &E));
      PetscCall(MatShellSetOperation(E, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS));
      PetscCall(MatAXPY(E, -1.0, Ae, DIFFERENT_NONZERO_PATTERN));
      PetscCall(MatNorm(E, NORM_1, &n1));
      PetscCall(MatNorm(E, NORM_INFINITY, &ni));
      PetscCall(MatNorm(E, NORM_2, &n2));
      if (checke) { /* also dump explicit (dense->AIJ) operators for visual inspection */
        Mat eA, eE, eAe;

        PetscCall(MatComputeOperator(A, MATAIJ, &eA));
        PetscCall(MatComputeOperator(E, MATAIJ, &eE));
        PetscCall(MatComputeOperator(Ae, MATAIJ, &eAe));
        PetscCall(MatFilter(eA, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
        PetscCall(MatFilter(eE, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
        PetscCall(MatFilter(eAe, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
        PetscCall(PetscObjectSetName((PetscObject)eA, "H2Mat"));
        PetscCall(MatView(eA, NULL));
        PetscCall(PetscObjectSetName((PetscObject)eAe, "S"));
        PetscCall(MatView(eAe, NULL));
        PetscCall(PetscObjectSetName((PetscObject)eE, "H2Mat - S"));
        PetscCall(MatView(eE, NULL));
        PetscCall(MatDestroy(&eA));
        PetscCall(MatDestroy(&eE));
        PetscCall(MatDestroy(&eAe));
      }

      /* temporarily use the sampling-based norm on Ae for a comparable reference */
      PetscCall(MatGetOperation(Ae, MATOP_NORM, &normfunc));
      PetscCall(MatSetOperation(Ae, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS));
PetscCall(MatNorm(Ae, NORM_1, &n1A)); 992 PetscCall(MatNorm(Ae, NORM_INFINITY, &niA)); 993 PetscCall(MatNorm(Ae, NORM_2, &n2A)); 994 n1A = PetscMax(n1A, PETSC_SMALL); 995 n2A = PetscMax(n2A, PETSC_SMALL); 996 niA = PetscMax(niA, PETSC_SMALL); 997 PetscCall(MatSetOperation(Ae, MATOP_NORM, normfunc)); 998 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n", (double)n1, (double)ni, (double)n2, (double)(n1 / n1A), (double)(ni / niA), (double)(n2 / n2A))); 999 PetscCall(MatDestroy(&E)); 1000 } 1001 a->sampler->SetSamplingMat(NULL); 1002 } 1003 PetscFunctionReturn(PETSC_SUCCESS); 1004 } 1005 1006 static PetscErrorCode MatZeroEntries_H2OPUS(Mat A) 1007 { 1008 PetscMPIInt size; 1009 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1010 1011 PetscFunctionBegin; 1012 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1013 PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet supported"); 1014 a->hmatrix->clearData(); 1015 #if defined(PETSC_H2OPUS_USE_GPU) 1016 if (a->hmatrix_gpu) a->hmatrix_gpu->clearData(); 1017 #endif 1018 PetscFunctionReturn(PETSC_SUCCESS); 1019 } 1020 1021 static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA) 1022 { 1023 Mat A; 1024 Mat_H2OPUS *a, *b = (Mat_H2OPUS *)B->data; 1025 #if defined(PETSC_H2OPUS_USE_GPU) 1026 PetscBool iscpu = PETSC_FALSE; 1027 #else 1028 PetscBool iscpu = PETSC_TRUE; 1029 #endif 1030 MPI_Comm comm; 1031 1032 PetscFunctionBegin; 1033 PetscCall(PetscObjectGetComm((PetscObject)B, &comm)); 1034 PetscCall(MatCreate(comm, &A)); 1035 PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N)); 1036 PetscCall(MatSetType(A, MATH2OPUS)); 1037 PetscCall(MatPropagateSymmetryOptions(B, A)); 1038 a = (Mat_H2OPUS *)A->data; 1039 1040 a->eta = b->eta; 1041 a->leafsize = b->leafsize; 1042 a->basisord = b->basisord; 1043 a->max_rank = b->max_rank; 1044 a->bs = b->bs; 1045 a->rtol = 
b->rtol; 1046 a->norm_max_samples = b->norm_max_samples; 1047 if (op == MAT_COPY_VALUES) a->s = b->s; 1048 1049 a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud); 1050 if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel); 1051 1052 #if defined(H2OPUS_USE_MPI) 1053 if (b->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix); 1054 #if defined(PETSC_H2OPUS_USE_GPU) 1055 if (b->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu); 1056 #endif 1057 #endif 1058 if (b->hmatrix) { 1059 a->hmatrix = new HMatrix(*b->hmatrix); 1060 if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData(); 1061 } 1062 #if defined(PETSC_H2OPUS_USE_GPU) 1063 if (b->hmatrix_gpu) { 1064 a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu); 1065 if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData(); 1066 } 1067 #endif 1068 if (b->sf) { 1069 PetscCall(PetscObjectReference((PetscObject)b->sf)); 1070 a->sf = b->sf; 1071 } 1072 if (b->h2opus_indexmap) { 1073 PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap)); 1074 a->h2opus_indexmap = b->h2opus_indexmap; 1075 } 1076 1077 PetscCall(MatSetUp(A)); 1078 PetscCall(MatSetUpMultiply_H2OPUS(A)); 1079 if (op == MAT_COPY_VALUES) { 1080 A->assembled = PETSC_TRUE; 1081 a->orthogonal = b->orthogonal; 1082 #if defined(PETSC_H2OPUS_USE_GPU) 1083 A->offloadmask = B->offloadmask; 1084 #endif 1085 } 1086 #if defined(PETSC_H2OPUS_USE_GPU) 1087 iscpu = B->boundtocpu; 1088 #endif 1089 PetscCall(MatBindToCPU(A, iscpu)); 1090 1091 *nA = A; 1092 PetscFunctionReturn(PETSC_SUCCESS); 1093 } 1094 1095 static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view) 1096 { 1097 Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data; 1098 PetscBool isascii, vieweps; 1099 PetscMPIInt size; 1100 PetscViewerFormat format; 1101 1102 PetscFunctionBegin; 1103 PetscCall(PetscObjectTypeCompare((PetscObject)view, PETSCVIEWERASCII, &isascii)); 1104 
  PetscCall(PetscViewerGetFormat(view, &format));
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (isascii) {
    /* MATLAB format: dump the raw H-matrix structure; only available sequentially
       since dumpHMatrix operates on the local (non-distributed) HMatrix */
    if (format == PETSC_VIEWER_ASCII_MATLAB) {
      if (size == 1) {
        FILE *fp;
        PetscCall(PetscViewerASCIIGetPointer(view, &fp));
        dumpHMatrix(*h2opus->hmatrix, 6, fp);
      }
    } else {
      /* Human-readable summary: construction mode, cluster-tree parameters and
         (kernel case) approximation order or (sampling case) sampling parameters */
      PetscCall(PetscViewerASCIIPrintf(view, " H-Matrix constructed from %s\n", h2opus->kernel ? "Kernel" : "Mat"));
      PetscCall(PetscViewerASCIIPrintf(view, " PointCloud dim %" PetscInt_FMT "\n", h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0));
      PetscCall(PetscViewerASCIIPrintf(view, " Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n", h2opus->leafsize, (double)h2opus->eta));
      if (!h2opus->kernel) {
        PetscCall(PetscViewerASCIIPrintf(view, " Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n", h2opus->max_rank, h2opus->bs, (double)h2opus->rtol));
      } else {
        PetscCall(PetscViewerASCIIPrintf(view, " Offdiagonal blocks approximation order %" PetscInt_FMT "\n", h2opus->basisord));
      }
      PetscCall(PetscViewerASCIIPrintf(view, " Number of samples for norms %" PetscInt_FMT "\n", h2opus->norm_max_samples));
      if (size == 1) {
        /* Sequential case: query memory usage straight from the (possibly absent) CPU/GPU matrices */
        double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
        double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
        /* NOTE(review): the rest of this file guards the *_gpu members with PETSC_H2OPUS_USE_GPU
           (PETSC_HAVE_CUDA && H2OPUS_USE_GPU); using plain PETSC_HAVE_CUDA here looks inconsistent
           for a CUDA-enabled PETSc built against a CPU-only H2OPUS — confirm hmatrix_gpu exists
           in that configuration or tighten the guard */
#if defined(PETSC_HAVE_CUDA)
        double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
        double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
#endif
        PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu));
#if defined(PETSC_HAVE_CUDA)
        PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu));
#endif
      } else {
        /* Parallel case: gather per-rank local memory usage, then sum over the communicator below.
           Slots: [0] CPU dense, [1] CPU low-rank, and with CUDA also [2] GPU dense, [3] GPU low-rank */
#if defined(PETSC_HAVE_CUDA)
        double matrix_mem[4] = {0., 0., 0., 0.};
        PetscMPIInt rsize = 4;
#else
        double matrix_mem[2] = {0., 0.};
        PetscMPIInt rsize = 2;
#endif
#if defined(H2OPUS_USE_MPI)
        matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
        matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
#if defined(PETSC_HAVE_CUDA)
        matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
        matrix_mem[3] = h2opus->dist_hmatrix_gpu ?
h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0; 1148 #endif 1149 #endif 1150 PetscCall(MPIU_Allreduce(MPI_IN_PLACE, matrix_mem, rsize, MPI_DOUBLE_PRECISION, MPI_SUM, PetscObjectComm((PetscObject)A))); 1151 PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1])); 1152 #if defined(PETSC_HAVE_CUDA) 1153 PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3])); 1154 #endif 1155 } 1156 } 1157 } 1158 vieweps = PETSC_FALSE; 1159 PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps", &vieweps, NULL)); 1160 if (vieweps) { 1161 char filename[256]; 1162 const char *name; 1163 1164 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1165 PetscCall(PetscSNPrintf(filename, sizeof(filename), "%s_structure.eps", name)); 1166 PetscCall(PetscOptionsGetString(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps_filename", filename, sizeof(filename), NULL)); 1167 outputEps(*h2opus->hmatrix, filename); 1168 } 1169 PetscFunctionReturn(PETSC_SUCCESS); 1170 } 1171 1172 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx) 1173 { 1174 Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data; 1175 PetscReal *gcoords; 1176 PetscInt N; 1177 MPI_Comm comm; 1178 PetscMPIInt size; 1179 PetscBool cong; 1180 1181 PetscFunctionBegin; 1182 PetscCall(PetscLayoutSetUp(A->rmap)); 1183 PetscCall(PetscLayoutSetUp(A->cmap)); 1184 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 1185 PetscCall(MatHasCongruentLayouts(A, &cong)); 1186 PetscCheck(cong, comm, PETSC_ERR_SUP, "Only for square matrices with congruent layouts"); 1187 N = A->rmap->N; 1188 PetscCallMPI(MPI_Comm_size(comm, &size)); 1189 if 
(spacedim > 0 && size > 1 && cdist) { 1190 PetscSF sf; 1191 MPI_Datatype dtype; 1192 1193 PetscCallMPI(MPI_Type_contiguous(spacedim, MPIU_REAL, &dtype)); 1194 PetscCallMPI(MPI_Type_commit(&dtype)); 1195 1196 PetscCall(PetscSFCreate(comm, &sf)); 1197 PetscCall(PetscSFSetGraphWithPattern(sf, A->rmap, PETSCSF_PATTERN_ALLGATHER)); 1198 PetscCall(PetscMalloc1(spacedim * N, &gcoords)); 1199 PetscCall(PetscSFBcastBegin(sf, dtype, coords, gcoords, MPI_REPLACE)); 1200 PetscCall(PetscSFBcastEnd(sf, dtype, coords, gcoords, MPI_REPLACE)); 1201 PetscCall(PetscSFDestroy(&sf)); 1202 PetscCallMPI(MPI_Type_free(&dtype)); 1203 } else gcoords = (PetscReal *)coords; 1204 1205 delete h2opus->ptcloud; 1206 delete h2opus->kernel; 1207 h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim, N, gcoords); 1208 if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel, spacedim, kernelctx); 1209 if (gcoords != coords) PetscCall(PetscFree(gcoords)); 1210 A->preallocated = PETSC_TRUE; 1211 PetscFunctionReturn(PETSC_SUCCESS); 1212 } 1213 1214 #if defined(PETSC_H2OPUS_USE_GPU) 1215 static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg) 1216 { 1217 PetscMPIInt size; 1218 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1219 1220 PetscFunctionBegin; 1221 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1222 if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) { 1223 if (size > 1) { 1224 PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1225 #if defined(H2OPUS_USE_MPI) 1226 if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu); 1227 else *a->dist_hmatrix = *a->dist_hmatrix_gpu; 1228 #endif 1229 } else { 1230 PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1231 if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu); 1232 else *a->hmatrix = *a->hmatrix_gpu; 1233 } 1234 delete a->hmatrix_gpu; 1235 delete a->dist_hmatrix_gpu; 1236 
a->hmatrix_gpu = NULL; 1237 a->dist_hmatrix_gpu = NULL; 1238 A->offloadmask = PETSC_OFFLOAD_CPU; 1239 } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) { 1240 if (size > 1) { 1241 PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1242 #if defined(H2OPUS_USE_MPI) 1243 if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix); 1244 else *a->dist_hmatrix_gpu = *a->dist_hmatrix; 1245 #endif 1246 } else { 1247 PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1248 if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix); 1249 else *a->hmatrix_gpu = *a->hmatrix; 1250 } 1251 delete a->hmatrix; 1252 delete a->dist_hmatrix; 1253 a->hmatrix = NULL; 1254 a->dist_hmatrix = NULL; 1255 A->offloadmask = PETSC_OFFLOAD_GPU; 1256 } 1257 PetscCall(PetscFree(A->defaultvectype)); 1258 if (!flg) { 1259 PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype)); 1260 } else { 1261 PetscCall(PetscStrallocpy(VECSTANDARD, &A->defaultvectype)); 1262 } 1263 A->boundtocpu = flg; 1264 PetscFunctionReturn(PETSC_SUCCESS); 1265 } 1266 #endif 1267 1268 /*MC 1269 MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package. 1270 1271 Options Database Key: 1272 . -mat_type h2opus - matrix type to "h2opus" 1273 1274 Level: beginner 1275 1276 Notes: 1277 H2Opus implements hierarchical matrices in the H^2 flavour. It supports CPU or NVIDIA GPUs. 1278 1279 For CPU only builds, use `./configure --download-h2opus --download-thrust` to install PETSc to use H2Opus. 1280 In order to run on NVIDIA GPUs, use `./configure --download-h2opus --download-magma --download-kblas`. 1281 1282 Reference: 1283 . 
* - "H2Opus: A distributed-memory multi-GPU software package for non-local operators", https://arxiv.org/abs/2109.05451 1284 1285 .seealso: [](ch_matrices), `Mat`, `MATH2OPUS`, `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()` 1286 M*/ 1287 PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A) 1288 { 1289 Mat_H2OPUS *a; 1290 PetscMPIInt size; 1291 1292 PetscFunctionBegin; 1293 #if defined(PETSC_H2OPUS_USE_GPU) 1294 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 1295 #endif 1296 PetscCall(PetscNew(&a)); 1297 A->data = (void *)a; 1298 1299 a->eta = 0.9; 1300 a->leafsize = 32; 1301 a->basisord = 4; 1302 a->max_rank = 64; 1303 a->bs = 32; 1304 a->rtol = 1.e-4; 1305 a->s = 1.0; 1306 a->norm_max_samples = 10; 1307 a->resize = PETSC_TRUE; /* reallocate after compression */ 1308 #if defined(H2OPUS_USE_MPI) 1309 h2opusCreateDistributedHandleComm(&a->handle, PetscObjectComm((PetscObject)A)); 1310 #else 1311 h2opusCreateHandle(&a->handle); 1312 #endif 1313 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1314 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATH2OPUS)); 1315 PetscCall(PetscMemzero(A->ops, sizeof(struct _MatOps))); 1316 1317 A->ops->destroy = MatDestroy_H2OPUS; 1318 A->ops->view = MatView_H2OPUS; 1319 A->ops->assemblyend = MatAssemblyEnd_H2OPUS; 1320 A->ops->mult = MatMult_H2OPUS; 1321 A->ops->multtranspose = MatMultTranspose_H2OPUS; 1322 A->ops->multadd = MatMultAdd_H2OPUS; 1323 A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS; 1324 A->ops->scale = MatScale_H2OPUS; 1325 A->ops->duplicate = MatDuplicate_H2OPUS; 1326 A->ops->setfromoptions = MatSetFromOptions_H2OPUS; 1327 A->ops->norm = MatNorm_H2OPUS; 1328 A->ops->zeroentries = MatZeroEntries_H2OPUS; 1329 #if defined(PETSC_H2OPUS_USE_GPU) 1330 A->ops->bindtocpu = MatBindToCPU_H2OPUS; 1331 #endif 1332 1333 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", MatProductSetFromOptions_H2OPUS)); 1334 
PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", MatProductSetFromOptions_H2OPUS)); 1335 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", MatProductSetFromOptions_H2OPUS)); 1336 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", MatProductSetFromOptions_H2OPUS)); 1337 #if defined(PETSC_H2OPUS_USE_GPU) 1338 PetscCall(PetscFree(A->defaultvectype)); 1339 PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype)); 1340 #endif 1341 PetscFunctionReturn(PETSC_SUCCESS); 1342 } 1343 1344 /*@C 1345 MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix. 1346 1347 Input Parameter: 1348 . A - the matrix 1349 1350 Level: intermediate 1351 1352 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()` 1353 @*/ 1354 PetscErrorCode MatH2OpusOrthogonalize(Mat A) 1355 { 1356 PetscBool ish2opus; 1357 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1358 PetscMPIInt size; 1359 PetscBool boundtocpu = PETSC_TRUE; 1360 1361 PetscFunctionBegin; 1362 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1363 PetscValidType(A, 1); 1364 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 1365 if (!ish2opus) PetscFunctionReturn(PETSC_SUCCESS); 1366 if (a->orthogonal) PetscFunctionReturn(PETSC_SUCCESS); 1367 HLibProfile::clear(); 1368 PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog, A, 0, 0, 0)); 1369 #if defined(PETSC_H2OPUS_USE_GPU) 1370 boundtocpu = A->boundtocpu; 1371 #endif 1372 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1373 if (size > 1) { 1374 if (boundtocpu) { 1375 PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1376 #if defined(H2OPUS_USE_MPI) 1377 distributed_horthog(*a->dist_hmatrix, a->handle); 1378 #endif 1379 #if defined(PETSC_H2OPUS_USE_GPU) 
1380 A->offloadmask = PETSC_OFFLOAD_CPU; 1381 } else { 1382 PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1383 PetscCall(PetscLogGpuTimeBegin()); 1384 #if defined(H2OPUS_USE_MPI) 1385 distributed_horthog(*a->dist_hmatrix_gpu, a->handle); 1386 #endif 1387 PetscCall(PetscLogGpuTimeEnd()); 1388 #endif 1389 } 1390 } else { 1391 #if defined(H2OPUS_USE_MPI) 1392 h2opusHandle_t handle = a->handle->handle; 1393 #else 1394 h2opusHandle_t handle = a->handle; 1395 #endif 1396 if (boundtocpu) { 1397 PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1398 horthog(*a->hmatrix, handle); 1399 #if defined(PETSC_H2OPUS_USE_GPU) 1400 A->offloadmask = PETSC_OFFLOAD_CPU; 1401 } else { 1402 PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1403 PetscCall(PetscLogGpuTimeBegin()); 1404 horthog(*a->hmatrix_gpu, handle); 1405 PetscCall(PetscLogGpuTimeEnd()); 1406 #endif 1407 } 1408 } 1409 a->orthogonal = PETSC_TRUE; 1410 { /* log flops */ 1411 double gops, time, perf, dev; 1412 HLibProfile::getHorthogPerf(gops, time, perf, dev); 1413 #if defined(PETSC_H2OPUS_USE_GPU) 1414 if (boundtocpu) { 1415 PetscCall(PetscLogFlops(1e9 * gops)); 1416 } else { 1417 PetscCall(PetscLogGpuFlops(1e9 * gops)); 1418 } 1419 #else 1420 PetscCall(PetscLogFlops(1e9 * gops)); 1421 #endif 1422 } 1423 PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog, A, 0, 0, 0)); 1424 PetscFunctionReturn(PETSC_SUCCESS); 1425 } 1426 1427 /*@C 1428 MatH2OpusCompress - Compress a hierarchical matrix. 
1429 1430 Input Parameters: 1431 + A - the matrix 1432 - tol - the absolute truncation threshold 1433 1434 Level: intermediate 1435 1436 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()` 1437 @*/ 1438 PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol) 1439 { 1440 PetscBool ish2opus; 1441 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1442 PetscMPIInt size; 1443 PetscBool boundtocpu = PETSC_TRUE; 1444 1445 PetscFunctionBegin; 1446 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1447 PetscValidType(A, 1); 1448 PetscValidLogicalCollectiveReal(A, tol, 2); 1449 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 1450 if (!ish2opus || tol <= 0.0) PetscFunctionReturn(PETSC_SUCCESS); 1451 PetscCall(MatH2OpusOrthogonalize(A)); 1452 HLibProfile::clear(); 1453 PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress, A, 0, 0, 0)); 1454 #if defined(PETSC_H2OPUS_USE_GPU) 1455 boundtocpu = A->boundtocpu; 1456 #endif 1457 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1458 if (size > 1) { 1459 if (boundtocpu) { 1460 PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1461 #if defined(H2OPUS_USE_MPI) 1462 distributed_hcompress(*a->dist_hmatrix, tol, a->handle); 1463 if (a->resize) { 1464 DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix); 1465 delete a->dist_hmatrix; 1466 a->dist_hmatrix = dist_hmatrix; 1467 } 1468 #endif 1469 #if defined(PETSC_H2OPUS_USE_GPU) 1470 A->offloadmask = PETSC_OFFLOAD_CPU; 1471 } else { 1472 PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1473 PetscCall(PetscLogGpuTimeBegin()); 1474 #if defined(H2OPUS_USE_MPI) 1475 distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle); 1476 1477 if (a->resize) { 1478 DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu); 1479 
delete a->dist_hmatrix_gpu; 1480 a->dist_hmatrix_gpu = dist_hmatrix_gpu; 1481 } 1482 #endif 1483 PetscCall(PetscLogGpuTimeEnd()); 1484 #endif 1485 } 1486 } else { 1487 #if defined(H2OPUS_USE_MPI) 1488 h2opusHandle_t handle = a->handle->handle; 1489 #else 1490 h2opusHandle_t handle = a->handle; 1491 #endif 1492 if (boundtocpu) { 1493 PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1494 hcompress(*a->hmatrix, tol, handle); 1495 1496 if (a->resize) { 1497 HMatrix *hmatrix = new HMatrix(*a->hmatrix); 1498 delete a->hmatrix; 1499 a->hmatrix = hmatrix; 1500 } 1501 #if defined(PETSC_H2OPUS_USE_GPU) 1502 A->offloadmask = PETSC_OFFLOAD_CPU; 1503 } else { 1504 PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1505 PetscCall(PetscLogGpuTimeBegin()); 1506 hcompress(*a->hmatrix_gpu, tol, handle); 1507 PetscCall(PetscLogGpuTimeEnd()); 1508 1509 if (a->resize) { 1510 HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu); 1511 delete a->hmatrix_gpu; 1512 a->hmatrix_gpu = hmatrix_gpu; 1513 } 1514 #endif 1515 } 1516 } 1517 { /* log flops */ 1518 double gops, time, perf, dev; 1519 HLibProfile::getHcompressPerf(gops, time, perf, dev); 1520 #if defined(PETSC_H2OPUS_USE_GPU) 1521 if (boundtocpu) { 1522 PetscCall(PetscLogFlops(1e9 * gops)); 1523 } else { 1524 PetscCall(PetscLogGpuFlops(1e9 * gops)); 1525 } 1526 #else 1527 PetscCall(PetscLogFlops(1e9 * gops)); 1528 #endif 1529 } 1530 PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress, A, 0, 0, 0)); 1531 PetscFunctionReturn(PETSC_SUCCESS); 1532 } 1533 1534 /*@C 1535 MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix-vector products on another matrix to construct a hierarchical matrix. 1536 1537 Input Parameters: 1538 + A - the hierarchical matrix 1539 . B - the matrix to be sampled 1540 . 
bs - maximum number of samples to be taken concurrently 1541 - tol - relative tolerance for construction 1542 1543 Notes: 1544 You need to call `MatAssemblyBegin()` and `MatAssemblyEnd()` to update the hierarchical matrix. 1545 1546 Level: intermediate 1547 1548 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()` 1549 @*/ 1550 PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol) 1551 { 1552 PetscBool ish2opus; 1553 1554 PetscFunctionBegin; 1555 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1556 PetscValidType(A, 1); 1557 if (B) PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 1558 PetscValidLogicalCollectiveInt(A, bs, 3); 1559 PetscValidLogicalCollectiveReal(A, tol, 3); 1560 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 1561 if (ish2opus) { 1562 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1563 1564 if (!a->sampler) a->sampler = new PetscMatrixSampler(); 1565 a->sampler->SetSamplingMat(B); 1566 if (bs > 0) a->bs = bs; 1567 if (tol > 0.) a->rtol = tol; 1568 delete a->kernel; 1569 } 1570 PetscFunctionReturn(PETSC_SUCCESS); 1571 } 1572 1573 /*@C 1574 MatCreateH2OpusFromKernel - Creates a `MATH2OPUS` from a user-supplied kernel. 1575 1576 Input Parameters: 1577 + comm - MPI communicator 1578 . m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given) 1579 . n - number of local columns (or `PETSC_DECIDE` to have calculated if `N` is given) 1580 . M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given) 1581 . N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given) 1582 . spacedim - dimension of the space coordinates 1583 . coords - coordinates of the points 1584 . cdist - whether or not coordinates are distributed 1585 . kernel - computational kernel (or `NULL`) 1586 . kernelctx - kernel context 1587 . 
eta - admissibility condition tolerance 1588 . leafsize - leaf size in cluster tree 1589 - basisord - approximation order for Chebychev interpolation of low-rank blocks 1590 1591 Output Parameter: 1592 . nA - matrix 1593 1594 Options Database Keys: 1595 + -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree 1596 . -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance 1597 . -mat_h2opus_order <`PetscInt`> - Chebychev approximation order 1598 - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms 1599 1600 Level: intermediate 1601 1602 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()` 1603 @*/ 1604 PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat *nA) 1605 { 1606 Mat A; 1607 Mat_H2OPUS *h2opus; 1608 #if defined(PETSC_H2OPUS_USE_GPU) 1609 PetscBool iscpu = PETSC_FALSE; 1610 #else 1611 PetscBool iscpu = PETSC_TRUE; 1612 #endif 1613 1614 PetscFunctionBegin; 1615 PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported"); 1616 PetscCall(MatCreate(comm, &A)); 1617 PetscCall(MatSetSizes(A, m, n, M, N)); 1618 PetscCheck(M == N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported"); 1619 PetscCall(MatSetType(A, MATH2OPUS)); 1620 PetscCall(MatBindToCPU(A, iscpu)); 1621 PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, kernel, kernelctx)); 1622 1623 h2opus = (Mat_H2OPUS *)A->data; 1624 if (eta > 0.) h2opus->eta = eta; 1625 if (leafsize > 0) h2opus->leafsize = leafsize; 1626 if (basisord > 0) h2opus->basisord = basisord; 1627 1628 *nA = A; 1629 PetscFunctionReturn(PETSC_SUCCESS); 1630 } 1631 1632 /*@C 1633 MatCreateH2OpusFromMat - Creates a `MATH2OPUS` sampling from a user-supplied operator. 
1634 1635 Input Parameters: 1636 + B - the matrix to be sampled 1637 . spacedim - dimension of the space coordinates 1638 . coords - coordinates of the points 1639 . cdist - whether or not coordinates are distributed 1640 . eta - admissibility condition tolerance 1641 . leafsize - leaf size in cluster tree 1642 . maxrank - maximum rank allowed 1643 . bs - maximum number of samples to be taken concurrently 1644 - rtol - relative tolerance for construction 1645 1646 Output Parameter: 1647 . nA - matrix 1648 1649 Options Database Keys: 1650 + -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree 1651 . -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance 1652 . -mat_h2opus_maxrank <`PetscInt`> - Maximum rank when constructed from matvecs 1653 . -mat_h2opus_samples <`PetscInt`> - Maximum number of samples to be taken concurrently when constructing from matvecs 1654 . -mat_h2opus_rtol <`PetscReal`> - Relative tolerance for construction from sampling 1655 . -mat_h2opus_check <`PetscBool`> - Check error when constructing from sampling during MatAssemblyEnd() 1656 . 
-mat_h2opus_hara_verbose <`PetscBool`> - Verbose output from hara construction
- -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms

  Level: intermediate

  Note:
  Not available in parallel

.seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()`
@*/
PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
{
  Mat A;
  Mat_H2OPUS *h2opus;
  MPI_Comm comm;
  PetscBool boundtocpu = PETSC_TRUE;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
  PetscValidLogicalCollectiveInt(B, spacedim, 2);
  PetscValidLogicalCollectiveBool(B, cdist, 4);
  PetscValidLogicalCollectiveReal(B, eta, 5);
  PetscValidLogicalCollectiveInt(B, leafsize, 6);
  PetscValidLogicalCollectiveInt(B, maxrank, 7);
  PetscValidLogicalCollectiveInt(B, bs, 8);
  PetscValidLogicalCollectiveReal(B, rtol, 9);
  PetscAssertPointer(nA, 10);
  PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
  /* Only square matrices with matching local sizes are supported */
  PetscCheck(B->rmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
  PetscCheck(B->rmap->N == B->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
  PetscCall(MatCreate(comm, &A));
  PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
#if defined(PETSC_H2OPUS_USE_GPU)
  /* Choose the initial CPU/GPU binding from the vector type of the sampled matrix:
     CUDA (and CUDA-backed Kokkos) vectors keep the construction on the device */
  {
    VecType vtype;
    PetscBool isstd, iscuda, iskok;

    PetscCall(MatGetVecType(B, &vtype));
    PetscCall(PetscStrcmpAny(vtype, &isstd, VECSTANDARD, VECSEQ, VECMPI, ""));
    PetscCall(PetscStrcmpAny(vtype, &iscuda, VECCUDA, VECSEQCUDA, VECMPICUDA, ""));
    PetscCall(PetscStrcmpAny(vtype, &iskok, VECKOKKOS, VECSEQKOKKOS, VECMPIKOKKOS, ""));
    PetscCheck(isstd || iscuda || iskok, comm, PETSC_ERR_SUP, "Not for type %s", vtype);
    if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
    if (iskok && PetscDefined(HAVE_MACRO_KOKKOS_ENABLE_CUDA)) boundtocpu = PETSC_FALSE;
  }
#endif
  PetscCall(MatSetType(A, MATH2OPUS));
  PetscCall(MatBindToCPU(A, boundtocpu));
  /* Coordinates only (no kernel): entries will come from sampling B at MatAssemblyEnd */
  if (spacedim) PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, NULL, NULL));
  PetscCall(MatPropagateSymmetryOptions(B, A));
  /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */

  /* Record the sampler and any positive user overrides of the default parameters */
  h2opus = (Mat_H2OPUS *)A->data;
  h2opus->sampler = new PetscMatrixSampler(B);
  if (eta > 0.) h2opus->eta = eta;
  if (leafsize > 0) h2opus->leafsize = leafsize;
  if (maxrank > 0) h2opus->max_rank = maxrank;
  if (bs > 0) h2opus->bs = bs;
  if (rtol > 0.) h2opus->rtol = rtol;
  *nA = A;
  A->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatH2OpusGetIndexMap - Access reordering index set.

  Input Parameter:
. A - the matrix

  Output Parameter:
.
indexmap - the index set for the reordering 1728 1729 Level: intermediate 1730 1731 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()` 1732 @*/ 1733 PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap) 1734 { 1735 PetscBool ish2opus; 1736 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1737 1738 PetscFunctionBegin; 1739 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1740 PetscValidType(A, 1); 1741 PetscAssertPointer(indexmap, 2); 1742 PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 1743 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 1744 PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name); 1745 *indexmap = a->h2opus_indexmap; 1746 PetscFunctionReturn(PETSC_SUCCESS); 1747 } 1748 1749 /*@C 1750 MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering 1751 1752 Input Parameters: 1753 + A - the matrix 1754 . nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map 1755 - in - the vector to be mapped 1756 1757 Output Parameter: 1758 . 
out - the newly created mapped vector 1759 1760 Level: intermediate 1761 1762 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()` 1763 @*/ 1764 PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec *out) 1765 { 1766 PetscBool ish2opus; 1767 Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1768 PetscScalar *xin, *xout; 1769 PetscBool nm; 1770 1771 PetscFunctionBegin; 1772 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1773 PetscValidType(A, 1); 1774 PetscValidLogicalCollectiveBool(A, nativetopetsc, 2); 1775 PetscValidHeaderSpecific(in, VEC_CLASSID, 3); 1776 PetscAssertPointer(out, 4); 1777 PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 1778 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 1779 PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name); 1780 nm = a->nativemult; 1781 PetscCall(MatH2OpusSetNativeMult(A, (PetscBool)!nativetopetsc)); 1782 PetscCall(MatCreateVecs(A, out, NULL)); 1783 PetscCall(MatH2OpusSetNativeMult(A, nm)); 1784 if (!a->sf) { /* same ordering */ 1785 PetscCall(VecCopy(in, *out)); 1786 PetscFunctionReturn(PETSC_SUCCESS); 1787 } 1788 PetscCall(VecGetArrayRead(in, (const PetscScalar **)&xin)); 1789 PetscCall(VecGetArrayWrite(*out, &xout)); 1790 if (nativetopetsc) { 1791 PetscCall(PetscSFReduceBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE)); 1792 PetscCall(PetscSFReduceEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE)); 1793 } else { 1794 PetscCall(PetscSFBcastBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE)); 1795 PetscCall(PetscSFBcastEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE)); 1796 } 1797 PetscCall(VecRestoreArrayRead(in, (const PetscScalar **)&xin)); 1798 PetscCall(VecRestoreArrayWrite(*out, &xout)); 1799 PetscFunctionReturn(PETSC_SUCCESS); 1800 } 1801 1802 /*@C 1803 MatH2OpusLowRankUpdate - Perform a 
low-rank update of the form A = A + s * U * V^T

  Input Parameters:
+ A - the hierarchical `MATH2OPUS` matrix
. U - the dense low-rank update matrix
. V - (optional) the dense low-rank update matrix (if `NULL`, then `V` = `U` is assumed)
- s - the scaling factor

  Note:
  The `U` and `V` matrices must be in dense format

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE`
@*/
PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s)
{
  PetscBool flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidType(A, 1);
  PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
  PetscValidHeaderSpecific(U, MAT_CLASSID, 2);
  PetscCheckSameComm(A, 1, U, 2);
  if (V) {
    PetscValidHeaderSpecific(V, MAT_CLASSID, 3);
    PetscCheckSameComm(A, 1, V, 3);
  }
  PetscValidLogicalCollectiveScalar(A, s, 4);

  if (!V) V = U; /* symmetric update A + s * U * U^T */
  PetscCheck(U->cmap->N == V->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT, U->cmap->N, V->cmap->N);
  if (!U->cmap->N) PetscFunctionReturn(PETSC_SUCCESS); /* rank-0 update: nothing to do */
  PetscCall(PetscLayoutCompare(U->rmap, A->rmap, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A and U must have the same row layout");
  PetscCall(PetscLayoutCompare(V->rmap, A->cmap, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A column layout must match V row column layout");
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &flg));
  if (flg) {
    Mat_H2OPUS        *a = (Mat_H2OPUS *)A->data;
    const PetscScalar *u, *v, *uu, *vv;
    PetscInt           ldu, ldv;
    PetscMPIInt        size;
#if defined(H2OPUS_USE_MPI)
    h2opusHandle_t handle = a->handle->handle;
#else
    h2opusHandle_t handle = a->handle;
#endif
    /* if a PETSc->H2Opus permutation SF exists and we are not in native-multiply
       mode, the dense factors must be permuted into H2Opus ordering first */
    PetscBool usesf = (PetscBool)(a->sf && !a->nativemult);
    PetscSF   usf, vsf;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
    PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet implemented in parallel");
    PetscCall(PetscLogEventBegin(MAT_H2Opus_LR, A, 0, 0, 0));
    PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U, &flg, MATSEQDENSE, MATMPIDENSE, ""));
    PetscCheck(flg, PetscObjectComm((PetscObject)U), PETSC_ERR_SUP, "Not for U of type %s", ((PetscObject)U)->type_name);
    PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V, &flg, MATSEQDENSE, MATMPIDENSE, ""));
    PetscCheck(flg, PetscObjectComm((PetscObject)V), PETSC_ERR_SUP, "Not for V of type %s", ((PetscObject)V)->type_name);
    PetscCall(MatDenseGetLDA(U, &ldu));
    PetscCall(MatDenseGetLDA(V, &ldv));
    PetscCall(MatBoundToCPU(A, &flg)); /* flg now selects the CPU (true) or GPU (false) path below */
    if (usesf) {
      PetscInt n;

      PetscCall(MatDenseGetH2OpusVectorSF(U, a->sf, &usf));
      PetscCall(MatDenseGetH2OpusVectorSF(V, a->sf, &vsf));
      PetscCall(MatH2OpusResizeBuffers_Private(A, U->cmap->N, V->cmap->N));
      /* permuted buffers have leading dimension equal to the number of SF leaves */
      PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
      ldu = n;
      ldv = n;
    }
    if (flg) { /* CPU path */
      PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      PetscCall(MatDenseGetArrayRead(U, &u));
      PetscCall(MatDenseGetArrayRead(V, &v));
      if (usesf) {
        /* scatter V (and U when distinct) into the internal work buffers in H2Opus ordering */
        vv = MatH2OpusGetThrustPointer(*a->yy);
        PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
        if (U != V) {
          uu = MatH2OpusGetThrustPointer(*a->xx);
          PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
          PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
        } else uu = vv;
      } else {
        uu = u;
        vv = v;
      }
      hlru_global(*a->hmatrix, uu, ldu, vv, ldv, U->cmap->N, s, handle);
      PetscCall(MatDenseRestoreArrayRead(U, &u));
      PetscCall(MatDenseRestoreArrayRead(V, &v));
    } else { /* GPU path */
#if defined(PETSC_H2OPUS_USE_GPU)
      PetscBool flgU, flgV;

      PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      /* host dense matrices are converted in place to CUDA dense (and back afterwards) */
      PetscCall(PetscObjectTypeCompareAny((PetscObject)U, &flgU, MATSEQDENSE, MATMPIDENSE, ""));
      if (flgU) PetscCall(MatConvert(U, MATDENSECUDA, MAT_INPLACE_MATRIX, &U));
      PetscCall(PetscObjectTypeCompareAny((PetscObject)V, &flgV, MATSEQDENSE, MATMPIDENSE, ""));
      if (flgV) PetscCall(MatConvert(V, MATDENSECUDA, MAT_INPLACE_MATRIX, &V));
      PetscCall(MatDenseCUDAGetArrayRead(U, &u));
      PetscCall(MatDenseCUDAGetArrayRead(V, &v));
      if (usesf) {
        vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
        PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
        if (U != V) {
          uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
          PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
          PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
        } else uu = vv;
      } else {
        uu = u;
        vv = v;
      }
#else
      SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "This should not happen");
#endif
      hlru_global(*a->hmatrix_gpu, uu, ldu, vv, ldv, U->cmap->N, s, handle);
#if defined(PETSC_H2OPUS_USE_GPU)
      PetscCall(MatDenseCUDARestoreArrayRead(U, &u));
      PetscCall(MatDenseCUDARestoreArrayRead(V, &v));
      if (flgU) PetscCall(MatConvert(U, MATDENSE, MAT_INPLACE_MATRIX, &U));
      if (flgV) PetscCall(MatConvert(V, MATDENSE, MAT_INPLACE_MATRIX, &V));
#endif
    }
    PetscCall(PetscLogEventEnd(MAT_H2Opus_LR, A, 0, 0, 0));
    /* the update invalidates any previously computed orthogonal basis */
    a->orthogonal = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif