1 #include <h2opusconf.h> 2 /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */ 3 #if !defined(__CUDACC__) || defined(H2OPUS_USE_GPU) 4 #include <h2opus.h> 5 #if defined(H2OPUS_USE_MPI) 6 #include <h2opus/distributed/distributed_h2opus_handle.h> 7 #include <h2opus/distributed/distributed_geometric_construction.h> 8 #include <h2opus/distributed/distributed_hgemv.h> 9 #include <h2opus/distributed/distributed_horthog.h> 10 #include <h2opus/distributed/distributed_hcompress.h> 11 #endif 12 #include <h2opus/util/boxentrygen.h> 13 #include <petsc/private/matimpl.h> 14 #include <petsc/private/vecimpl.h> 15 #include <petsc/private/deviceimpl.h> 16 #include <petscsf.h> 17 18 /* math2opusutils */ 19 PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF,PetscInt,PetscInt,PetscInt,PetscSF*); 20 PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat,PetscSF,PetscSF*); 21 PETSC_INTERN PetscErrorCode VecSign(Vec,Vec); 22 PETSC_INTERN PetscErrorCode VecSetDelta(Vec,PetscInt); 23 PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat,NormType,PetscInt,PetscReal*); 24 25 #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data()) 26 27 /* Use GPU only if H2OPUS is configured for GPU */ 28 #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU) 29 #define PETSC_H2OPUS_USE_GPU 30 #endif 31 #if defined(PETSC_H2OPUS_USE_GPU) 32 #define MatH2OpusUpdateIfNeeded(A,B) MatBindToCPU(A,(PetscBool)((A)->boundtocpu || (B))) 33 #else 34 #define MatH2OpusUpdateIfNeeded(A,B) 0 35 #endif 36 37 // TODO H2OPUS: 38 // DistributedHMatrix 39 // unsymmetric ? 40 // transpose for distributed_hgemv? 41 // clearData() 42 // Unify interface for sequential and parallel? 
43 // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled) 44 // 45 template <class T> class PetscPointCloud : public H2OpusDataSet<T> 46 { 47 private: 48 int dimension; 49 size_t num_points; 50 std::vector<T> pts; 51 52 public: 53 PetscPointCloud(int dim, size_t num_pts, const T coords[]) 54 { 55 dim = dim > 0 ? dim : 1; 56 this->dimension = dim; 57 this->num_points = num_pts; 58 59 pts.resize(num_pts*dim); 60 if (coords) { 61 for (size_t n = 0; n < num_pts; n++) 62 for (int i = 0; i < dim; i++) 63 pts[n*dim + i] = coords[n*dim + i]; 64 } else { 65 PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0; 66 for (size_t n = 0; n < num_pts; n++) { 67 pts[n*dim] = n*h; 68 for (int i = 1; i < dim; i++) 69 pts[n*dim + i] = 0.0; 70 } 71 } 72 } 73 74 PetscPointCloud(const PetscPointCloud<T>& other) 75 { 76 size_t N = other.dimension * other.num_points; 77 this->dimension = other.dimension; 78 this->num_points = other.num_points; 79 this->pts.resize(N); 80 for (size_t i = 0; i < N; i++) 81 this->pts[i] = other.pts[i]; 82 } 83 84 int getDimension() const 85 { 86 return dimension; 87 } 88 89 size_t getDataSetSize() const 90 { 91 return num_points; 92 } 93 94 T getDataPoint(size_t idx, int dim) const 95 { 96 assert(dim < dimension && idx < num_points); 97 return pts[idx*dimension + dim]; 98 } 99 100 void Print(std::ostream& out = std::cout) 101 { 102 out << "Dimension: " << dimension << std::endl; 103 out << "NumPoints: " << num_points << std::endl; 104 for (size_t n = 0; n < num_points; n++) { 105 for (int d = 0; d < dimension; d++) 106 out << pts[n*dimension + d] << " "; 107 out << std::endl; 108 } 109 } 110 }; 111 112 template<class T> class PetscFunctionGenerator 113 { 114 private: 115 MatH2OpusKernel k; 116 int dim; 117 void *ctx; 118 119 public: 120 PetscFunctionGenerator(MatH2OpusKernel k, int dim, void* ctx) { this->k = k; this->dim = dim; this->ctx = ctx; } 121 PetscFunctionGenerator(PetscFunctionGenerator& other) { 
this->k = other.k; this->dim = other.dim; this->ctx = other.ctx; } 122 T operator()(PetscReal *pt1, PetscReal *pt2) 123 { 124 return (T)((*this->k)(this->dim,pt1,pt2,this->ctx)); 125 } 126 }; 127 128 #include <../src/mat/impls/h2opus/math2opussampler.hpp> 129 130 /* just to not clutter the code */ 131 #if !defined(H2OPUS_USE_GPU) 132 typedef HMatrix HMatrix_GPU; 133 #if defined(H2OPUS_USE_MPI) 134 typedef DistributedHMatrix DistributedHMatrix_GPU; 135 #endif 136 #endif 137 138 typedef struct { 139 #if defined(H2OPUS_USE_MPI) 140 distributedH2OpusHandle_t handle; 141 #else 142 h2opusHandle_t handle; 143 #endif 144 /* Sequential and parallel matrices are two different classes at the moment */ 145 HMatrix *hmatrix; 146 #if defined(H2OPUS_USE_MPI) 147 DistributedHMatrix *dist_hmatrix; 148 #else 149 HMatrix *dist_hmatrix; /* just to not clutter the code */ 150 #endif 151 /* May use permutations */ 152 PetscSF sf; 153 PetscLayout h2opus_rmap, h2opus_cmap; 154 IS h2opus_indexmap; 155 thrust::host_vector<PetscScalar> *xx,*yy; 156 PetscInt xxs,yys; 157 PetscBool multsetup; 158 159 /* GPU */ 160 HMatrix_GPU *hmatrix_gpu; 161 #if defined(H2OPUS_USE_MPI) 162 DistributedHMatrix_GPU *dist_hmatrix_gpu; 163 #else 164 HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */ 165 #endif 166 #if defined(PETSC_H2OPUS_USE_GPU) 167 thrust::device_vector<PetscScalar> *xx_gpu,*yy_gpu; 168 PetscInt xxs_gpu,yys_gpu; 169 #endif 170 171 /* construction from matvecs */ 172 PetscMatrixSampler* sampler; 173 PetscBool nativemult; 174 175 /* Admissibility */ 176 PetscReal eta; 177 PetscInt leafsize; 178 179 /* for dof reordering */ 180 PetscPointCloud<PetscReal> *ptcloud; 181 182 /* kernel for generating matrix entries */ 183 PetscFunctionGenerator<PetscScalar> *kernel; 184 185 /* basis orthogonalized? 
*/ 186 PetscBool orthogonal; 187 188 /* customization */ 189 PetscInt basisord; 190 PetscInt max_rank; 191 PetscInt bs; 192 PetscReal rtol; 193 PetscInt norm_max_samples; 194 PetscBool check_construction; 195 PetscBool hara_verbose; 196 PetscBool resize; 197 198 /* keeps track of MatScale values */ 199 PetscScalar s; 200 } Mat_H2OPUS; 201 202 static PetscErrorCode MatDestroy_H2OPUS(Mat A) 203 { 204 Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 205 206 PetscFunctionBegin; 207 #if defined(H2OPUS_USE_MPI) 208 h2opusDestroyDistributedHandle(a->handle); 209 #else 210 h2opusDestroyHandle(a->handle); 211 #endif 212 delete a->dist_hmatrix; 213 delete a->hmatrix; 214 PetscCall(PetscSFDestroy(&a->sf)); 215 PetscCall(PetscLayoutDestroy(&a->h2opus_rmap)); 216 PetscCall(PetscLayoutDestroy(&a->h2opus_cmap)); 217 PetscCall(ISDestroy(&a->h2opus_indexmap)); 218 delete a->xx; 219 delete a->yy; 220 delete a->hmatrix_gpu; 221 delete a->dist_hmatrix_gpu; 222 #if defined(PETSC_H2OPUS_USE_GPU) 223 delete a->xx_gpu; 224 delete a->yy_gpu; 225 #endif 226 delete a->sampler; 227 delete a->ptcloud; 228 delete a->kernel; 229 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",NULL)); 230 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",NULL)); 231 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",NULL)); 232 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",NULL)); 233 PetscCall(PetscObjectChangeTypeName((PetscObject)A,NULL)); 234 PetscCall(PetscFree(A->data)); 235 PetscFunctionReturn(0); 236 } 237 238 PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm) 239 { 240 Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 241 PetscBool ish2opus; 242 243 PetscFunctionBegin; 244 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 245 PetscValidLogicalCollectiveBool(A,nm,2); 246 
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
  if (ish2opus) {
    if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
      /* swap only on an actual state change (off->on or on->off) */
      if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
        PetscLayout t;
        t = A->rmap;
        A->rmap = a->h2opus_rmap;
        a->h2opus_rmap = t;
        t = A->cmap;
        A->cmap = a->h2opus_cmap;
        a->h2opus_cmap = t;
      }
    }
    a->nativemult = nm;
  }
  PetscFunctionReturn(0);
}

/* Query whether "native" (H2OPUS-ordered) multiplication is enabled.
   Errors if A is not of type MATH2OPUS. */
PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
{
  Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
  PetscBool ish2opus;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(nm,2);
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
  PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
  *nm = a->nativemult;
  PetscFunctionReturn(0);
}

/* Randomized norm estimation (via MatApproximateNorm_Private).
   For MATH2OPUS matrices, temporarily switches to native mult as a fast
   path (norms are independent of the ordering) and restores the previous
   setting afterwards. */
PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal* n)
{
  PetscBool ish2opus;
  PetscInt nmax = PETSC_DECIDE;
  Mat_H2OPUS *a = NULL;
  PetscBool mult = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
  if (ish2opus) { /* set userdefine number of samples and fastpath for mult (norms are order independent) */
    a = (Mat_H2OPUS*)A->data;

    nmax = a->norm_max_samples;
    mult = a->nativemult; /* saved so it can be restored below */
    PetscCall(MatH2OpusSetNativeMult(A,PETSC_TRUE));
  } else {
    PetscCall(PetscOptionsGetInt(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_approximate_norm_samples",&nmax,NULL));
  }
  PetscCall(MatApproximateNorm_Private(A,normtype,nmax,n));
  if (a) PetscCall(MatH2OpusSetNativeMult(A,mult)); /* restore previous setting */
  PetscFunctionReturn(0);
}

/* Grow the host (and, when the matrix is offloaded, device) work buffers so
   they can hold xN input and yN output columns of the permuted multiply */
static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
{
  Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data;
  PetscInt n;
  PetscBool boundtocpu = PETSC_TRUE;

  PetscFunctionBegin;
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
  if (boundtocpu) {
    /* buffers only grow, never shrink */
    if (h2opus->xxs < xN) { h2opus->xx->resize(n*xN); h2opus->xxs = xN; }
    if (h2opus->yys < yN) { h2opus->yy->resize(n*yN); h2opus->yys = yN; }
  }
#if defined(PETSC_H2OPUS_USE_GPU)
  if (!boundtocpu) {
    if (h2opus->xxs_gpu < xN) { h2opus->xx_gpu->resize(n*xN); h2opus->xxs_gpu = xN; }
    if (h2opus->yys_gpu < yN) { h2opus->yy_gpu->resize(n*yN); h2opus->yys_gpu = yN; }
  }
#endif
  PetscFunctionReturn(0);
}

/* Multi-column multiply kernel: C = op(A) * B with op = transpose when
   transA is true. Handles CPU vs GPU execution, sequential vs distributed
   matrices, and the optional permutation (through star forests) between
   PETSc and H2OPUS orderings. */
static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
{
  Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data;
#if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = h2opus->handle->handle;
#else
  h2opusHandle_t handle = h2opus->handle;
#endif
  PetscBool boundtocpu = PETSC_TRUE;
  PetscScalar *xx,*yy,*uxx,*uyy;
  PetscInt blda,clda;
  PetscMPIInt size;
  PetscSF bsf,csf; /* only set/used when usesf is true */
  PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

  PetscFunctionBegin;
  HLibProfile::clear(); /* reset profiling so the flop log below sees only this call */
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  PetscCall(MatDenseGetLDA(B,&blda));
  PetscCall(MatDenseGetLDA(C,&clda));
  if (usesf) { /* multiply through the permuted work buffers */
    PetscInt n;

    PetscCall(MatDenseGetH2OpusVectorSF(B,h2opus->sf,&bsf));
    PetscCall(MatDenseGetH2OpusVectorSF(C,h2opus->sf,&csf));

    PetscCall(MatH2OpusResizeBuffers_Private(A,B->cmap->N,C->cmap->N));
    PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
    blda = n;
    clda = n;
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  if (boundtocpu) {
    PetscCall(MatDenseGetArrayRead(B,(const PetscScalar**)&xx));
    PetscCall(MatDenseGetArrayWrite(C,&yy));
    if (usesf) {
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
      PetscCall(PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
    } else {
      uxx = xx;
      uyy = yy;
    }
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
      PetscCheck(!transA || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
      hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
    }
    PetscCall(MatDenseRestoreArrayRead(B,(const PetscScalar**)&xx));
    if (usesf) { /* scatter the permuted result back into C */
      PetscCall(PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
    }
    PetscCall(MatDenseRestoreArrayWrite(C,&yy));
#if defined(PETSC_H2OPUS_USE_GPU)
  } else {
    PetscBool ciscuda,biscuda;

    /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
    PetscCall(PetscObjectTypeCompareAny((PetscObject)B,&biscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
    if (!biscuda) {
      PetscCall(MatConvert(B,MATDENSECUDA,MAT_INPLACE_MATRIX,&B));
    }
    PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&ciscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
    if (!ciscuda) {
      /* presumably needed so the in-place conversion accepts the (not yet
         assembled) result matrix -- TODO confirm */
      C->assembled = PETSC_TRUE;
      PetscCall(MatConvert(C,MATDENSECUDA,MAT_INPLACE_MATRIX,&C));
    }
    PetscCall(MatDenseCUDAGetArrayRead(B,(const PetscScalar**)&xx));
    PetscCall(MatDenseCUDAGetArrayWrite(C,&yy));
    if (usesf) {
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
      PetscCall(PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
    } else {
      uxx = xx;
      uyy = yy;
    }
    PetscCall(PetscLogGpuTimeBegin());
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix");
      PetscCheck(!transA || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatDenseCUDARestoreArrayRead(B,(const PetscScalar**)&xx));
    if (usesf) {
      PetscCall(PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
    }
    PetscCall(MatDenseCUDARestoreArrayWrite(C,&yy));
    /* undo the on-the-fly conversions */
    if (!biscuda) {
      PetscCall(MatConvert(B,MATDENSE,MAT_INPLACE_MATRIX,&B));
    }
    if (!ciscuda) {
      PetscCall(MatConvert(C,MATDENSE,MAT_INPLACE_MATRIX,&C));
    }
#endif
  }
  { /* log flops */
    double gops,time,perf,dev;
    HLibProfile::getHgemvPerf(gops,time,perf,dev);
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9*gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9*gops));
    }
#else
    PetscCall(PetscLogFlops(1e9*gops));
#endif
  }
  PetscFunctionReturn(0);
}

/* MatProduct numeric phase: dispatches AB / AtB to the N-column kernel */
static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
{
  Mat_Product *product = C->product;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  switch (product->type) {
  case MATPRODUCT_AB:
    PetscCall(MatMultNKernel_H2OPUS(product->A,PETSC_FALSE,product->B,C));
    break;
  case MATPRODUCT_AtB:
    PetscCall(MatMultNKernel_H2OPUS(product->A,PETSC_TRUE,product->B,C));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
  }
  PetscFunctionReturn(0);
}

/* MatProduct symbolic phase: sets sizes/type of C and installs the numeric
   callback */
static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
{
  Mat_Product *product = C->product;
  PetscBool cisdense;
  Mat A,B;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  A = product->A;
  B = product->B;
  switch (product->type) {
  case MATPRODUCT_AB:
    PetscCall(MatSetSizes(C,A->rmap->n,B->cmap->n,A->rmap->N,B->cmap->N));
    PetscCall(MatSetBlockSizesFromMats(C,product->A,product->B));
    /* keep C dense; default to B's dense type if C is not dense already */
    PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
    if (!cisdense) PetscCall(MatSetType(C,((PetscObject)product->B)->type_name));
    PetscCall(MatSetUp(C));
    break;
  case MATPRODUCT_AtB:
    PetscCall(MatSetSizes(C,A->cmap->n,B->cmap->n,A->cmap->N,B->cmap->N));
    PetscCall(MatSetBlockSizesFromMats(C,product->A,product->B));
    PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
    if (!cisdense) PetscCall(MatSetType(C,((PetscObject)product->B)->type_name));
    PetscCall(MatSetUp(C));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
  }
  C->ops->productsymbolic = NULL; /* symbolic phase runs once */
  C->ops->productnumeric = MatProductNumeric_H2OPUS;
  PetscFunctionReturn(0);
}

/* Enable MatProduct for the supported types (AB and AtB) only */
static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
{
  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) {
    C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
  }
  PetscFunctionReturn(0);
}

/* Single-vector multiply kernel: y = op(A)*x + sy*y with op = transpose
   when trans is true. Mirrors MatMultNKernel_H2OPUS for the one-column
   case, including CPU/GPU, sequential/distributed and permutation paths. */
static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
{
  Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data;
#if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = h2opus->handle->handle;
#else
  h2opusHandle_t handle = h2opus->handle;
#endif
  PetscBool boundtocpu = PETSC_TRUE;
  PetscInt n;
  PetscScalar *xx,*yy,*uxx,*uyy;
  PetscMPIInt size;
  PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

  PetscFunctionBegin;
  HLibProfile::clear();
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  if (usesf) {
    PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
  } else n = A->rmap->n;
  if (boundtocpu) {
    PetscCall(VecGetArrayRead(x,(const PetscScalar**)&xx));
    if (sy == 0.0) { /* y is write-only when not accumulating */
      PetscCall(VecGetArrayWrite(y,&yy));
    } else {
      PetscCall(VecGetArray(y,&yy));
    }
    if (usesf) {
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy);

      PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      if (sy != 0.0) { /* accumulation needs the current y in H2OPUS ordering */
        PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
      }
    } else {
      uxx = xx;
      uyy = yy;
    }
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
      PetscCheck(!trans || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
      hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
    }
    PetscCall(VecRestoreArrayRead(x,(const PetscScalar**)&xx));
    if (usesf) {
      PetscCall(PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
    }
    if (sy == 0.0) {
      PetscCall(VecRestoreArrayWrite(y,&yy));
    } else {
      PetscCall(VecRestoreArray(y,&yy));
    }
#if defined(PETSC_H2OPUS_USE_GPU)
  } else {
    PetscCall(VecCUDAGetArrayRead(x,(const PetscScalar**)&xx));
    if (sy == 0.0) {
      PetscCall(VecCUDAGetArrayWrite(y,&yy));
    } else {
      PetscCall(VecCUDAGetArray(y,&yy));
    }
    if (usesf) {
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);

      PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      if (sy != 0.0) {
        PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
      }
    } else {
      uxx = xx;
      uyy = yy;
    }
    PetscCall(PetscLogGpuTimeBegin());
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix");
      PetscCheck(!trans || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArrayRead(x,(const PetscScalar**)&xx));
    if (usesf) {
      PetscCall(PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
    }
    if (sy == 0.0) {
      PetscCall(VecCUDARestoreArrayWrite(y,&yy));
    } else {
      PetscCall(VecCUDARestoreArray(y,&yy));
    }
#endif
  }
  { /* log flops */
    double gops,time,perf,dev;
    HLibProfile::getHgemvPerf(gops,time,perf,dev);
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9*gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9*gops));
    }
#else
    PetscCall(PetscLogFlops(1e9*gops));
#endif
  }
  PetscFunctionReturn(0);
}

/* y = A^T x; updates CPU/GPU copies first if the vector types require it */
static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
{
  PetscBool xiscuda,yiscuda;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
  PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,""));
  PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda));
  PetscCall(MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* y = A x */
static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
{
  PetscBool xiscuda,yiscuda;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
  PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,""));
  PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda));
  PetscCall(MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* z = y + A^T x, implemented as z = y; z += A^T x */
static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
{
  PetscBool xiscuda,ziscuda;
PetscFunctionBegin; 671 PetscCall(VecCopy(y,z)); 672 PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"")); 673 PetscCall(PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,"")); 674 PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda)); 675 PetscCall(MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_TRUE)); 676 PetscFunctionReturn(0); 677 } 678 679 static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z) 680 { 681 PetscBool xiscuda,ziscuda; 682 683 PetscFunctionBegin; 684 PetscCall(VecCopy(y,z)); 685 PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"")); 686 PetscCall(PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,"")); 687 PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda)); 688 PetscCall(MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_FALSE)); 689 PetscFunctionReturn(0); 690 } 691 692 static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s) 693 { 694 Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 695 696 PetscFunctionBegin; 697 a->s *= s; 698 PetscFunctionReturn(0); 699 } 700 701 static PetscErrorCode MatSetFromOptions_H2OPUS(PetscOptionItems *PetscOptionsObject,Mat A) 702 { 703 Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 704 705 PetscFunctionBegin; 706 PetscOptionsHeadBegin(PetscOptionsObject,"H2OPUS options"); 707 PetscCall(PetscOptionsInt("-mat_h2opus_leafsize","Leaf size of cluster tree",NULL,a->leafsize,&a->leafsize,NULL)); 708 PetscCall(PetscOptionsReal("-mat_h2opus_eta","Admissibility condition tolerance",NULL,a->eta,&a->eta,NULL)); 709 PetscCall(PetscOptionsInt("-mat_h2opus_order","Basis order for off-diagonal sampling when constructed from kernel",NULL,a->basisord,&a->basisord,NULL)); 710 PetscCall(PetscOptionsInt("-mat_h2opus_maxrank","Maximum rank when constructed from matvecs",NULL,a->max_rank,&a->max_rank,NULL)); 711 PetscCall(PetscOptionsInt("-mat_h2opus_samples","Maximum number of samples to be taken concurrently when constructing from 
matvecs",NULL,a->bs,&a->bs,NULL)); 712 PetscCall(PetscOptionsInt("-mat_h2opus_normsamples","Maximum bumber of samples to be when estimating norms",NULL,a->norm_max_samples,&a->norm_max_samples,NULL)); 713 PetscCall(PetscOptionsReal("-mat_h2opus_rtol","Relative tolerance for construction from sampling",NULL,a->rtol,&a->rtol,NULL)); 714 PetscCall(PetscOptionsBool("-mat_h2opus_check","Check error when constructing from sampling during MatAssemblyEnd()",NULL,a->check_construction,&a->check_construction,NULL)); 715 PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose","Verbose output from hara construction",NULL,a->hara_verbose,&a->hara_verbose,NULL)); 716 PetscCall(PetscOptionsBool("-mat_h2opus_resize","Resize after compression",NULL,a->resize,&a->resize,NULL)); 717 PetscOptionsHeadEnd(); 718 PetscFunctionReturn(0); 719 } 720 721 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat,PetscInt,const PetscReal[],PetscBool,MatH2OpusKernel,void*); 722 723 static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A) 724 { 725 Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 726 Vec c; 727 PetscInt spacedim; 728 const PetscScalar *coords; 729 730 PetscFunctionBegin; 731 if (a->ptcloud) PetscFunctionReturn(0); 732 PetscCall(PetscObjectQuery((PetscObject)A,"__math2opus_coords",(PetscObject*)&c)); 733 if (!c && a->sampler) { 734 Mat S = a->sampler->GetSamplingMat(); 735 736 PetscCall(PetscObjectQuery((PetscObject)S,"__math2opus_coords",(PetscObject*)&c)); 737 } 738 if (!c) { 739 PetscCall(MatH2OpusSetCoords_H2OPUS(A,-1,NULL,PETSC_FALSE,NULL,NULL)); 740 } else { 741 PetscCall(VecGetArrayRead(c,&coords)); 742 PetscCall(VecGetBlockSize(c,&spacedim)); 743 PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,PETSC_FALSE,NULL,NULL)); 744 PetscCall(VecRestoreArrayRead(c,&coords)); 745 } 746 PetscFunctionReturn(0); 747 } 748 749 static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A) 750 { 751 MPI_Comm comm; 752 PetscMPIInt size; 753 Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 754 PetscInt n = 0,*idx = 
NULL; 755 int *iidx = NULL; 756 PetscCopyMode own; 757 PetscBool rid; 758 759 PetscFunctionBegin; 760 if (a->multsetup) PetscFunctionReturn(0); 761 if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */ 762 PetscCall(PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL)); 763 #if defined(PETSC_H2OPUS_USE_GPU) 764 a->xx_gpu = new thrust::device_vector<PetscScalar>(n); 765 a->yy_gpu = new thrust::device_vector<PetscScalar>(n); 766 a->xxs_gpu = 1; 767 a->yys_gpu = 1; 768 #endif 769 a->xx = new thrust::host_vector<PetscScalar>(n); 770 a->yy = new thrust::host_vector<PetscScalar>(n); 771 a->xxs = 1; 772 a->yys = 1; 773 } else { 774 IS is; 775 PetscCall(PetscObjectGetComm((PetscObject)A,&comm)); 776 PetscCallMPI(MPI_Comm_size(comm,&size)); 777 if (!a->h2opus_indexmap) { 778 if (size > 1) { 779 PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix"); 780 #if defined(H2OPUS_USE_MPI) 781 iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map); 782 n = a->dist_hmatrix->basis_tree.basis_branch.index_map.size(); 783 #endif 784 } else { 785 iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map); 786 n = a->hmatrix->u_basis_tree.index_map.size(); 787 } 788 789 if (PetscDefined(USE_64BIT_INDICES)) { 790 PetscInt i; 791 792 own = PETSC_OWN_POINTER; 793 PetscCall(PetscMalloc1(n,&idx)); 794 for (i=0;i<n;i++) idx[i] = iidx[i]; 795 } else { 796 own = PETSC_COPY_VALUES; 797 idx = (PetscInt*)iidx; 798 } 799 PetscCall(ISCreateGeneral(comm,n,idx,own,&is)); 800 PetscCall(ISSetPermutation(is)); 801 PetscCall(ISViewFromOptions(is,(PetscObject)A,"-mat_h2opus_indexmap_view")); 802 a->h2opus_indexmap = is; 803 } 804 PetscCall(ISGetLocalSize(a->h2opus_indexmap,&n)); 805 PetscCall(ISGetIndices(a->h2opus_indexmap,(const PetscInt **)&idx)); 806 rid = (PetscBool)(n == A->rmap->n); 807 PetscCall(MPIU_Allreduce(MPI_IN_PLACE,&rid,1,MPIU_BOOL,MPI_LAND,comm)); 808 if (rid) { 809 
PetscCall(ISIdentity(a->h2opus_indexmap,&rid)); 810 } 811 if (!rid) { 812 if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */ 813 PetscCall(PetscLayoutCreate(comm,&a->h2opus_rmap)); 814 PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap,n)); 815 PetscCall(PetscLayoutSetUp(a->h2opus_rmap)); 816 PetscCall(PetscLayoutReference(a->h2opus_rmap,&a->h2opus_cmap)); 817 } 818 PetscCall(PetscSFCreate(comm,&a->sf)); 819 PetscCall(PetscSFSetGraphLayout(a->sf,A->rmap,n,NULL,PETSC_OWN_POINTER,idx)); 820 PetscCall(PetscSFSetUp(a->sf)); 821 PetscCall(PetscSFViewFromOptions(a->sf,(PetscObject)A,"-mat_h2opus_sf_view")); 822 #if defined(PETSC_H2OPUS_USE_GPU) 823 a->xx_gpu = new thrust::device_vector<PetscScalar>(n); 824 a->yy_gpu = new thrust::device_vector<PetscScalar>(n); 825 a->xxs_gpu = 1; 826 a->yys_gpu = 1; 827 #endif 828 a->xx = new thrust::host_vector<PetscScalar>(n); 829 a->yy = new thrust::host_vector<PetscScalar>(n); 830 a->xxs = 1; 831 a->yys = 1; 832 } 833 PetscCall(ISRestoreIndices(a->h2opus_indexmap,(const PetscInt **)&idx)); 834 } 835 a->multsetup = PETSC_TRUE; 836 PetscFunctionReturn(0); 837 } 838 839 static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype) 840 { 841 Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 842 #if defined(H2OPUS_USE_MPI) 843 h2opusHandle_t handle = a->handle->handle; 844 #else 845 h2opusHandle_t handle = a->handle; 846 #endif 847 PetscBool kernel = PETSC_FALSE; 848 PetscBool boundtocpu = PETSC_TRUE; 849 PetscBool samplingdone = PETSC_FALSE; 850 MPI_Comm comm; 851 PetscMPIInt size; 852 853 PetscFunctionBegin; 854 PetscCall(PetscObjectGetComm((PetscObject)A,&comm)); 855 PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported"); 856 PetscCheck(A->rmap->N == A->cmap->N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported"); 857 858 /* XXX */ 859 a->leafsize = PetscMin(a->leafsize, 
PetscMin(A->rmap->N, A->cmap->N)); 860 861 PetscCallMPI(MPI_Comm_size(comm,&size)); 862 /* TODO REUSABILITY of geometric construction */ 863 delete a->hmatrix; 864 delete a->dist_hmatrix; 865 #if defined(PETSC_H2OPUS_USE_GPU) 866 delete a->hmatrix_gpu; 867 delete a->dist_hmatrix_gpu; 868 #endif 869 a->orthogonal = PETSC_FALSE; 870 871 /* TODO: other? */ 872 H2OpusBoxCenterAdmissibility adm(a->eta); 873 874 PetscCall(PetscLogEventBegin(MAT_H2Opus_Build,A,0,0,0)); 875 if (size > 1) { 876 #if defined(H2OPUS_USE_MPI) 877 a->dist_hmatrix = new DistributedHMatrix(A->rmap->n/* ,A->symmetric */); 878 #else 879 a->dist_hmatrix = NULL; 880 #endif 881 } else { 882 a->hmatrix = new HMatrix(A->rmap->n,A->symmetric); 883 } 884 PetscCall(MatH2OpusInferCoordinates_Private(A)); 885 PetscCheck(a->ptcloud,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing pointcloud"); 886 if (a->kernel) { 887 BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel); 888 if (size > 1) { 889 PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix"); 890 #if defined(H2OPUS_USE_MPI) 891 buildDistributedHMatrix(*a->dist_hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord,a->handle); 892 #endif 893 } else { 894 buildHMatrix(*a->hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord); 895 } 896 kernel = PETSC_TRUE; 897 } else { 898 PetscCheck(size <= 1,comm,PETSC_ERR_SUP,"Construction from sampling not supported in parallel"); 899 buildHMatrixStructure(*a->hmatrix,a->ptcloud,a->leafsize,adm); 900 } 901 PetscCall(MatSetUpMultiply_H2OPUS(A)); 902 903 #if defined(PETSC_H2OPUS_USE_GPU) 904 boundtocpu = A->boundtocpu; 905 if (!boundtocpu) { 906 if (size > 1) { 907 PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix"); 908 #if defined(H2OPUS_USE_MPI) 909 a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix); 910 #endif 911 } else { 912 a->hmatrix_gpu = new 
HMatrix_GPU(*a->hmatrix); 913 } 914 } 915 #endif 916 if (size == 1) { 917 if (!kernel && a->sampler && a->sampler->GetSamplingMat()) { 918 PetscReal Anorm; 919 bool verbose; 920 921 PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_hara_verbose",&a->hara_verbose,NULL)); 922 verbose = a->hara_verbose; 923 PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(),NORM_2,a->norm_max_samples,&Anorm)); 924 if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF,"Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n",a->max_rank,a->rtol*Anorm,a->rtol,Anorm,boundtocpu ? "CPU" : "GPU",a->bs)); 925 if (a->sf && !a->nativemult) { 926 a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(),a->hmatrix->u_basis_tree.index_map.data()); 927 } 928 a->sampler->SetStream(handle->getMainStream()); 929 if (boundtocpu) { 930 a->sampler->SetGPUSampling(false); 931 hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose); 932 #if defined(PETSC_H2OPUS_USE_GPU) 933 } else { 934 a->sampler->SetGPUSampling(true); 935 hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose); 936 #endif 937 } 938 samplingdone = PETSC_TRUE; 939 } 940 } 941 #if defined(PETSC_H2OPUS_USE_GPU) 942 if (!boundtocpu) { 943 delete a->hmatrix; 944 delete a->dist_hmatrix; 945 a->hmatrix = NULL; 946 a->dist_hmatrix = NULL; 947 } 948 A->offloadmask = boundtocpu ? 
PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU; 949 #endif 950 PetscCall(PetscLogEventEnd(MAT_H2Opus_Build,A,0,0,0)); 951 952 if (!a->s) a->s = 1.0; 953 A->assembled = PETSC_TRUE; 954 955 if (samplingdone) { 956 PetscBool check = a->check_construction; 957 PetscBool checke = PETSC_FALSE; 958 959 PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check",&check,NULL)); 960 PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check_explicit",&checke,NULL)); 961 if (check) { 962 Mat E,Ae; 963 PetscReal n1,ni,n2; 964 PetscReal n1A,niA,n2A; 965 void (*normfunc)(void); 966 967 Ae = a->sampler->GetSamplingMat(); 968 PetscCall(MatConvert(A,MATSHELL,MAT_INITIAL_MATRIX,&E)); 969 PetscCall(MatShellSetOperation(E,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS)); 970 PetscCall(MatAXPY(E,-1.0,Ae,DIFFERENT_NONZERO_PATTERN)); 971 PetscCall(MatNorm(E,NORM_1,&n1)); 972 PetscCall(MatNorm(E,NORM_INFINITY,&ni)); 973 PetscCall(MatNorm(E,NORM_2,&n2)); 974 if (checke) { 975 Mat eA,eE,eAe; 976 977 PetscCall(MatComputeOperator(A,MATAIJ,&eA)); 978 PetscCall(MatComputeOperator(E,MATAIJ,&eE)); 979 PetscCall(MatComputeOperator(Ae,MATAIJ,&eAe)); 980 PetscCall(MatChop(eA,PETSC_SMALL)); 981 PetscCall(MatChop(eE,PETSC_SMALL)); 982 PetscCall(MatChop(eAe,PETSC_SMALL)); 983 PetscCall(PetscObjectSetName((PetscObject)eA,"H2Mat")); 984 PetscCall(MatView(eA,NULL)); 985 PetscCall(PetscObjectSetName((PetscObject)eAe,"S")); 986 PetscCall(MatView(eAe,NULL)); 987 PetscCall(PetscObjectSetName((PetscObject)eE,"H2Mat - S")); 988 PetscCall(MatView(eE,NULL)); 989 PetscCall(MatDestroy(&eA)); 990 PetscCall(MatDestroy(&eE)); 991 PetscCall(MatDestroy(&eAe)); 992 } 993 994 PetscCall(MatGetOperation(Ae,MATOP_NORM,&normfunc)); 995 PetscCall(MatSetOperation(Ae,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS)); 996 PetscCall(MatNorm(Ae,NORM_1,&n1A)); 997 PetscCall(MatNorm(Ae,NORM_INFINITY,&niA)); 998 PetscCall(MatNorm(Ae,NORM_2,&n2A)); 999 n1A = 
PetscMax(n1A,PETSC_SMALL); 1000 n2A = PetscMax(n2A,PETSC_SMALL); 1001 niA = PetscMax(niA,PETSC_SMALL); 1002 PetscCall(MatSetOperation(Ae,MATOP_NORM,normfunc)); 1003 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A),"MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n",(double)n1,(double)ni,(double)n2,(double)(n1/n1A),(double)(ni/niA),(double)(n2/n2A))); 1004 PetscCall(MatDestroy(&E)); 1005 } 1006 a->sampler->SetSamplingMat(NULL); 1007 } 1008 PetscFunctionReturn(0); 1009 } 1010 1011 static PetscErrorCode MatZeroEntries_H2OPUS(Mat A) 1012 { 1013 PetscMPIInt size; 1014 Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1015 1016 PetscFunctionBegin; 1017 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1018 PetscCheck(size <= 1,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet supported"); 1019 else { 1020 a->hmatrix->clearData(); 1021 #if defined(PETSC_H2OPUS_USE_GPU) 1022 if (a->hmatrix_gpu) a->hmatrix_gpu->clearData(); 1023 #endif 1024 } 1025 PetscFunctionReturn(0); 1026 } 1027 1028 static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA) 1029 { 1030 Mat A; 1031 Mat_H2OPUS *a, *b = (Mat_H2OPUS*)B->data; 1032 #if defined(PETSC_H2OPUS_USE_GPU) 1033 PetscBool iscpu = PETSC_FALSE; 1034 #else 1035 PetscBool iscpu = PETSC_TRUE; 1036 #endif 1037 MPI_Comm comm; 1038 1039 PetscFunctionBegin; 1040 PetscCall(PetscObjectGetComm((PetscObject)B,&comm)); 1041 PetscCall(MatCreate(comm,&A)); 1042 PetscCall(MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N)); 1043 PetscCall(MatSetType(A,MATH2OPUS)); 1044 PetscCall(MatPropagateSymmetryOptions(B,A)); 1045 a = (Mat_H2OPUS*)A->data; 1046 1047 a->eta = b->eta; 1048 a->leafsize = b->leafsize; 1049 a->basisord = b->basisord; 1050 a->max_rank = b->max_rank; 1051 a->bs = b->bs; 1052 a->rtol = b->rtol; 1053 a->norm_max_samples = b->norm_max_samples; 1054 if (op == MAT_COPY_VALUES) a->s = b->s; 1055 1056 a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud); 1057 
if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel); 1058 1059 #if defined(H2OPUS_USE_MPI) 1060 if (b->dist_hmatrix) { a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix); } 1061 #if defined(PETSC_H2OPUS_USE_GPU) 1062 if (b->dist_hmatrix_gpu) { a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu); } 1063 #endif 1064 #endif 1065 if (b->hmatrix) { 1066 a->hmatrix = new HMatrix(*b->hmatrix); 1067 if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData(); 1068 } 1069 #if defined(PETSC_H2OPUS_USE_GPU) 1070 if (b->hmatrix_gpu) { 1071 a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu); 1072 if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData(); 1073 } 1074 #endif 1075 if (b->sf) { 1076 PetscCall(PetscObjectReference((PetscObject)b->sf)); 1077 a->sf = b->sf; 1078 } 1079 if (b->h2opus_indexmap) { 1080 PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap)); 1081 a->h2opus_indexmap = b->h2opus_indexmap; 1082 } 1083 1084 PetscCall(MatSetUp(A)); 1085 PetscCall(MatSetUpMultiply_H2OPUS(A)); 1086 if (op == MAT_COPY_VALUES) { 1087 A->assembled = PETSC_TRUE; 1088 a->orthogonal = b->orthogonal; 1089 #if defined(PETSC_H2OPUS_USE_GPU) 1090 A->offloadmask = B->offloadmask; 1091 #endif 1092 } 1093 #if defined(PETSC_H2OPUS_USE_GPU) 1094 iscpu = B->boundtocpu; 1095 #endif 1096 PetscCall(MatBindToCPU(A,iscpu)); 1097 1098 *nA = A; 1099 PetscFunctionReturn(0); 1100 } 1101 1102 static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view) 1103 { 1104 Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data; 1105 PetscBool isascii, vieweps; 1106 PetscMPIInt size; 1107 PetscViewerFormat format; 1108 1109 PetscFunctionBegin; 1110 PetscCall(PetscObjectTypeCompare((PetscObject)view,PETSCVIEWERASCII,&isascii)); 1111 PetscCall(PetscViewerGetFormat(view,&format)); 1112 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1113 if (isascii) { 1114 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1115 if (size == 1) { 
        /* sequential MATLAB dump of the CPU H-matrix (depth 6) */
        FILE *fp;
        PetscCall(PetscViewerASCIIGetPointer(view,&fp));
        dumpHMatrix(*h2opus->hmatrix,6,fp);
      }
    } else {
      /* default ASCII format: report how the matrix was built and its parameters */
      PetscCall(PetscViewerASCIIPrintf(view," H-Matrix constructed from %s\n",h2opus->kernel ? "Kernel" : "Mat"));
      PetscCall(PetscViewerASCIIPrintf(view," PointCloud dim %" PetscInt_FMT "\n",h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0));
      PetscCall(PetscViewerASCIIPrintf(view," Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n",h2opus->leafsize,(double)h2opus->eta));
      if (!h2opus->kernel) {
        PetscCall(PetscViewerASCIIPrintf(view," Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n",h2opus->max_rank,h2opus->bs,(double)h2opus->rtol));
      } else {
        PetscCall(PetscViewerASCIIPrintf(view," Offdiagonal blocks approximation order %" PetscInt_FMT "\n",h2opus->basisord));
      }
      PetscCall(PetscViewerASCIIPrintf(view," Number of samples for norms %" PetscInt_FMT "\n",h2opus->norm_max_samples));
      if (size == 1) {
        /* sequential case: query memory usage directly from the (possibly absent) CPU/GPU matrices */
        double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
        double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
#if defined(PETSC_HAVE_CUDA)
        double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
        double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
#endif
        PetscCall(PetscViewerASCIIPrintf(view," Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu));
#if defined(PETSC_HAVE_CUDA)
        PetscCall(PetscViewerASCIIPrintf(view," Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu));
#endif
      } else {
        /* parallel case: sum per-rank local usage; slots 2,3 hold GPU numbers when CUDA is enabled */
#if defined(PETSC_HAVE_CUDA)
        double matrix_mem[4] = {0.,0.,0.,0.};
        PetscMPIInt rsize = 4;
#else
        double matrix_mem[2] = {0.,0.};
        PetscMPIInt rsize = 2;
#endif
#if defined(H2OPUS_USE_MPI)
        matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
        matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
#if defined(PETSC_HAVE_CUDA)
        matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
        matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
#endif
#endif
        PetscCall(MPIU_Allreduce(MPI_IN_PLACE,matrix_mem,rsize,MPI_DOUBLE_PRECISION,MPI_SUM,PetscObjectComm((PetscObject)A)));
        PetscCall(PetscViewerASCIIPrintf(view," Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]));
#if defined(PETSC_HAVE_CUDA)
        PetscCall(PetscViewerASCIIPrintf(view," Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]));
#endif
      }
    }
  }
  /* optional EPS plot of the hierarchical structure, enabled by -mat_h2opus_vieweps */
  vieweps = PETSC_FALSE;
  PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_vieweps",&vieweps,NULL));
  if (vieweps) {
    char filename[256];
    const char *name;

    PetscCall(PetscObjectGetName((PetscObject)A,&name));
    PetscCall(PetscSNPrintf(filename,sizeof(filename),"%s_structure.eps",name));
    PetscCall(PetscOptionsGetString(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_vieweps_filename",filename,sizeof(filename),NULL));
    outputEps(*h2opus->hmatrix,filename);
  }
  PetscFunctionReturn(0);
}

/* MatH2OpusSetCoords_H2OPUS - Store the point-cloud coordinates (and optional
   kernel) used to build the hierarchical matrix. When the coordinates are
   distributed (cdist) and we run in parallel, they are first allgathered so
   every process owns the full cloud. Requires square, congruent layouts. */
static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx)
{
  Mat_H2OPUS  *h2opus = (Mat_H2OPUS*)A->data;
  PetscReal   *gcoords;
  PetscInt    N;
  MPI_Comm    comm;
  PetscMPIInt size;
  PetscBool   cong;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
  PetscCall(MatHasCongruentLayouts(A,&cong));
  PetscCheck(cong,comm,PETSC_ERR_SUP,"Only for square matrices with congruent layouts");
  N = A->rmap->N;
  PetscCallMPI(MPI_Comm_size(comm,&size));
  if (spacedim > 0 && size > 1 && cdist) {
    /* allgather the distributed coordinates: one MPI datatype entry = one point (spacedim reals) */
    PetscSF      sf;
    MPI_Datatype dtype;

    PetscCallMPI(MPI_Type_contiguous(spacedim,MPIU_REAL,&dtype));
    PetscCallMPI(MPI_Type_commit(&dtype));

    PetscCall(PetscSFCreate(comm,&sf));
    PetscCall(PetscSFSetGraphWithPattern(sf,A->rmap,PETSCSF_PATTERN_ALLGATHER));
    PetscCall(PetscMalloc1(spacedim*N,&gcoords));
    PetscCall(PetscSFBcastBegin(sf,dtype,coords,gcoords,MPI_REPLACE));
    PetscCall(PetscSFBcastEnd(sf,dtype,coords,gcoords,MPI_REPLACE));
    PetscCall(PetscSFDestroy(&sf));
    PetscCallMPI(MPI_Type_free(&dtype));
  } else gcoords = (PetscReal*)coords; /* coordinates already global (or sequential run): use them in place */

  delete h2opus->ptcloud;
  delete h2opus->kernel;
  h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim,N,gcoords);
  if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel,spacedim,kernelctx);
  if (gcoords != coords) PetscCall(PetscFree(gcoords)); /* free only if we allocated the gathered copy */
  A->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

#if defined(PETSC_H2OPUS_USE_GPU)
/* MatBindToCPU_H2OPUS - Migrate the hierarchical matrix between host and
   device. Binding to CPU copies GPU data to (possibly new) CPU matrices and
   frees the GPU ones; unbinding does the reverse. The default vector type is
   switched accordingly. */
static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
{
  PetscMPIInt size;
  Mat_H2OPUS  *a = (Mat_H2OPUS*)A->data;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
    /* GPU -> CPU migration */
    if (size > 1) {
      PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
#if defined(H2OPUS_USE_MPI)
      if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
      else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
#endif
    } else {
      PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
      else *a->hmatrix = *a->hmatrix_gpu;
    }
    delete a->hmatrix_gpu;
    delete a->dist_hmatrix_gpu;
    a->hmatrix_gpu = NULL;
    a->dist_hmatrix_gpu = NULL;
    A->offloadmask = PETSC_OFFLOAD_CPU;
  } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
    /* CPU -> GPU migration (mirror of the branch above) */
    if (size > 1) {
      PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
#if defined(H2OPUS_USE_MPI)
      if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
      else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
#endif
    } else {
      PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
      if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
      else *a->hmatrix_gpu = *a->hmatrix;
    }
    delete a->hmatrix;
    delete a->dist_hmatrix;
    a->hmatrix = NULL;
    a->dist_hmatrix = NULL;
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }
  /* keep the default vector type consistent with where the matrix lives */
  PetscCall(PetscFree(A->defaultvectype));
  if (!flg) {
    PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype));
  } else {
    PetscCall(PetscStrallocpy(VECSTANDARD,&A->defaultvectype));
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}
#endif

/*MC
     MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package.

   Options Database Keys:
.     -mat_type h2opus - matrix type to "h2opus" during a call to MatSetFromOptions()

   Notes:
     H2Opus implements hierarchical matrices in the H^2 flavour.
     It supports CPU or NVIDIA GPUs.
     For CPU only builds, use ./configure --download-h2opus --download-thrust to install PETSc to use H2Opus.
     In order to run on NVIDIA GPUs, use ./configure --download-h2opus --download-magma --download-kblas.
     For details and additional references, see
       "H2Opus: A distributed-memory multi-GPU software package for non-local operators",
     available at https://arxiv.org/abs/2109.05451.
   Level: beginner

.seealso: `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()`
M*/
/* Type constructor: sets default construction parameters, creates the H2Opus
   handle, and installs the MATH2OPUS operation table */
PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
{
  Mat_H2OPUS  *a;
  PetscMPIInt size;

  PetscFunctionBegin;
#if defined(PETSC_H2OPUS_USE_GPU)
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
#endif
  PetscCall(PetscNewLog(A,&a));
  A->data = (void*)a;

  /* defaults; overridable via options or the MatCreateH2Opus* constructors */
  a->eta              = 0.9;
  a->leafsize         = 32;
  a->basisord         = 4;
  a->max_rank         = 64;
  a->bs               = 32;
  a->rtol             = 1.e-4;
  a->s                = 1.0;
  a->norm_max_samples = 10;
  a->resize           = PETSC_TRUE; /* reallocate after compression */
#if defined(H2OPUS_USE_MPI)
  h2opusCreateDistributedHandleComm(&a->handle,PetscObjectComm((PetscObject)A));
#else
  h2opusCreateHandle(&a->handle);
#endif
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATH2OPUS));
  PetscCall(PetscMemzero(A->ops,sizeof(struct _MatOps)));

  A->ops->destroy          = MatDestroy_H2OPUS;
  A->ops->view             = MatView_H2OPUS;
  A->ops->assemblyend      = MatAssemblyEnd_H2OPUS;
  A->ops->mult             = MatMult_H2OPUS;
  A->ops->multtranspose    = MatMultTranspose_H2OPUS;
  A->ops->multadd          = MatMultAdd_H2OPUS;
  A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
  A->ops->scale            = MatScale_H2OPUS;
  A->ops->duplicate        = MatDuplicate_H2OPUS;
  A->ops->setfromoptions   = MatSetFromOptions_H2OPUS;
  A->ops->norm             = MatNorm_H2OPUS;
  A->ops->zeroentries      = MatZeroEntries_H2OPUS;
#if defined(PETSC_H2OPUS_USE_GPU)
  A->ops->bindtocpu        = MatBindToCPU_H2OPUS;
#endif

  /* MatProduct support against (seq|mpi) dense and densecuda right-hand sides */
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",MatProductSetFromOptions_H2OPUS));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",MatProductSetFromOptions_H2OPUS));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",MatProductSetFromOptions_H2OPUS));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",MatProductSetFromOptions_H2OPUS));
#if defined(PETSC_H2OPUS_USE_GPU)
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype));
#endif
  PetscFunctionReturn(0);
}

/*@C
     MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.

   Input Parameter:
.     A - the matrix

   Level: intermediate

.seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`
@*/
PetscErrorCode MatH2OpusOrthogonalize(Mat A)
{
  PetscBool   ish2opus;
  Mat_H2OPUS  *a = (Mat_H2OPUS*)A->data;
  PetscMPIInt size;
  PetscBool   boundtocpu = PETSC_TRUE;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidType(A,1);
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
  if (!ish2opus) PetscFunctionReturn(0);
  if (a->orthogonal) PetscFunctionReturn(0); /* already orthogonal: nothing to do */
  HLibProfile::clear();
  PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog,A,0,0,0));
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  if (size > 1) {
    if (boundtocpu) {
      PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
#if defined(H2OPUS_USE_MPI)
      distributed_horthog(*a->dist_hmatrix, a->handle);
#endif
#if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
#if defined(H2OPUS_USE_MPI)
      /* distributed GPU path of MatH2OpusOrthogonalize (continued from above) */
      distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
#endif
      PetscCall(PetscLogGpuTimeEnd());
#endif
    }
  } else {
    /* sequential: extract the plain h2opus handle from the (possibly distributed) wrapper */
#if defined(H2OPUS_USE_MPI)
    h2opusHandle_t handle = a->handle->handle;
#else
    h2opusHandle_t handle = a->handle;
#endif
    if (boundtocpu) {
      PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
      horthog(*a->hmatrix, handle);
#if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
      horthog(*a->hmatrix_gpu, handle);
      PetscCall(PetscLogGpuTimeEnd());
#endif
    }
  }
  a->orthogonal = PETSC_TRUE;
  { /* log flops (HLibProfile reports Gops) */
    double gops,time,perf,dev;
    HLibProfile::getHorthogPerf(gops,time,perf,dev);
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9*gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9*gops));
    }
#else
    PetscCall(PetscLogFlops(1e9*gops));
#endif
  }
  PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog,A,0,0,0));
  PetscFunctionReturn(0);
}

/*@C
     MatH2OpusCompress - Compress a hierarchical matrix.

   Input Parameters:
+     A - the matrix
-     tol - the absolute truncation threshold

   Level: intermediate

.seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()`
@*/
PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
{
  PetscBool   ish2opus;
  Mat_H2OPUS  *a = (Mat_H2OPUS*)A->data;
  PetscMPIInt size;
  PetscBool   boundtocpu = PETSC_TRUE;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidType(A,1);
  PetscValidLogicalCollectiveReal(A,tol,2);
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
  if (!ish2opus || tol <= 0.0) PetscFunctionReturn(0);
  PetscCall(MatH2OpusOrthogonalize(A)); /* compression requires an orthogonal basis */
  HLibProfile::clear();
  PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress,A,0,0,0));
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  if (size > 1) {
    if (boundtocpu) {
      PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
#if defined(H2OPUS_USE_MPI)
      distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
      if (a->resize) {
        /* copy-construct to shrink allocations to the compressed size */
        DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix);
        delete a->dist_hmatrix;
        a->dist_hmatrix = dist_hmatrix;
      }
#endif
#if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
#if defined(H2OPUS_USE_MPI)
      distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);

      if (a->resize) {
        DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu);
        delete a->dist_hmatrix_gpu;
        a->dist_hmatrix_gpu = dist_hmatrix_gpu;
dist_hmatrix_gpu; 1488 } 1489 #endif 1490 PetscCall(PetscLogGpuTimeEnd()); 1491 #endif 1492 } 1493 } else { 1494 #if defined(H2OPUS_USE_MPI) 1495 h2opusHandle_t handle = a->handle->handle; 1496 #else 1497 h2opusHandle_t handle = a->handle; 1498 #endif 1499 if (boundtocpu) { 1500 PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1501 hcompress(*a->hmatrix, tol, handle); 1502 1503 if (a->resize) { 1504 HMatrix *hmatrix = new HMatrix(*a->hmatrix); 1505 delete a->hmatrix; 1506 a->hmatrix = hmatrix; 1507 } 1508 #if defined(PETSC_H2OPUS_USE_GPU) 1509 A->offloadmask = PETSC_OFFLOAD_CPU; 1510 } else { 1511 PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1512 PetscCall(PetscLogGpuTimeBegin()); 1513 hcompress(*a->hmatrix_gpu, tol, handle); 1514 PetscCall(PetscLogGpuTimeEnd()); 1515 1516 if (a->resize) { 1517 HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu); 1518 delete a->hmatrix_gpu; 1519 a->hmatrix_gpu = hmatrix_gpu; 1520 } 1521 #endif 1522 } 1523 } 1524 { /* log flops */ 1525 double gops,time,perf,dev; 1526 HLibProfile::getHcompressPerf(gops,time,perf,dev); 1527 #if defined(PETSC_H2OPUS_USE_GPU) 1528 if (boundtocpu) { 1529 PetscCall(PetscLogFlops(1e9*gops)); 1530 } else { 1531 PetscCall(PetscLogGpuFlops(1e9*gops)); 1532 } 1533 #else 1534 PetscCall(PetscLogFlops(1e9*gops)); 1535 #endif 1536 } 1537 PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress,A,0,0,0)); 1538 PetscFunctionReturn(0); 1539 } 1540 1541 /*@C 1542 MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix vector product to construct a hierarchical matrix. 1543 1544 Input Parameters: 1545 + A - the hierarchical matrix 1546 . B - the matrix to be sampled 1547 . bs - maximum number of samples to be taken concurrently 1548 - tol - relative tolerance for construction 1549 1550 Notes: Need to call MatAssemblyBegin/End() to update the hierarchical matrix. 
1551 1552 Level: intermediate 1553 1554 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()` 1555 @*/ 1556 PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol) 1557 { 1558 PetscBool ish2opus; 1559 1560 PetscFunctionBegin; 1561 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 1562 PetscValidType(A,1); 1563 if (B) PetscValidHeaderSpecific(B,MAT_CLASSID,2); 1564 PetscValidLogicalCollectiveInt(A,bs,3); 1565 PetscValidLogicalCollectiveReal(A,tol,3); 1566 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus)); 1567 if (ish2opus) { 1568 Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1569 1570 if (!a->sampler) a->sampler = new PetscMatrixSampler(); 1571 a->sampler->SetSamplingMat(B); 1572 if (bs > 0) a->bs = bs; 1573 if (tol > 0.) a->rtol = tol; 1574 delete a->kernel; 1575 } 1576 PetscFunctionReturn(0); 1577 } 1578 1579 /*@C 1580 MatCreateH2OpusFromKernel - Creates a MATH2OPUS from a user-supplied kernel. 1581 1582 Input Parameters: 1583 + comm - MPI communicator 1584 . m - number of local rows (or PETSC_DECIDE to have calculated if M is given) 1585 . n - number of local columns (or PETSC_DECIDE to have calculated if N is given) 1586 . M - number of global rows (or PETSC_DETERMINE to have calculated if m is given) 1587 . N - number of global columns (or PETSC_DETERMINE to have calculated if n is given) 1588 . spacedim - dimension of the space coordinates 1589 . coords - coordinates of the points 1590 . cdist - whether or not coordinates are distributed 1591 . kernel - computational kernel (or NULL) 1592 . kernelctx - kernel context 1593 . eta - admissibility condition tolerance 1594 . leafsize - leaf size in cluster tree 1595 - basisord - approximation order for Chebychev interpolation of low-rank blocks 1596 1597 Output Parameter: 1598 . 
   Options Database Keys:
+     -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree
.     -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance
.     -mat_h2opus_order <PetscInt> - Chebychev approximation order
-     -mat_h2opus_normsamples <PetscInt> - Maximum number of samples to be used when estimating norms

   Level: intermediate

.seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`
@*/
PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat* nA)
{
  Mat        A;
  Mat_H2OPUS *h2opus;
#if defined(PETSC_H2OPUS_USE_GPU)
  PetscBool  iscpu = PETSC_FALSE; /* default to GPU execution when available */
#else
  PetscBool  iscpu = PETSC_TRUE;
#endif

  PetscFunctionBegin;
  PetscCheck(m == n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
  PetscCall(MatCreate(comm,&A));
  PetscCall(MatSetSizes(A,m,n,M,N));
  PetscCheck(M == N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
  PetscCall(MatSetType(A,MATH2OPUS));
  PetscCall(MatBindToCPU(A,iscpu));
  PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,kernel,kernelctx));

  /* non-positive values keep the defaults set in MatCreate_H2OPUS() */
  h2opus = (Mat_H2OPUS*)A->data;
  if (eta > 0.) h2opus->eta = eta;
  if (leafsize > 0) h2opus->leafsize = leafsize;
  if (basisord > 0) h2opus->basisord = basisord;

  *nA = A;
  PetscFunctionReturn(0);
}

/*@C
     MatCreateH2OpusFromMat - Creates a MATH2OPUS sampling from a user-supplied operator.

   Input Parameters:
+     B - the matrix to be sampled
.     spacedim - dimension of the space coordinates
.     coords - coordinates of the points
.     cdist - whether or not coordinates are distributed
.     eta - admissibility condition tolerance
.     leafsize - leaf size in cluster tree
.     maxrank - maximum rank allowed
.     bs - maximum number of samples to be taken concurrently
-     rtol - relative tolerance for construction

   Output Parameter:
.     nA - matrix

   Options Database Keys:
+     -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree
.     -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance
.     -mat_h2opus_maxrank <PetscInt> - Maximum rank when constructed from matvecs
.     -mat_h2opus_samples <PetscInt> - Maximum number of samples to be taken concurrently when constructing from matvecs
.     -mat_h2opus_rtol <PetscReal> - Relative tolerance for construction from sampling
.     -mat_h2opus_check <PetscBool> - Check error when constructing from sampling during MatAssemblyEnd()
.     -mat_h2opus_hara_verbose <PetscBool> - Verbose output from hara construction
-     -mat_h2opus_normsamples <PetscInt> - Maximum number of samples to be used when estimating norms

   Notes: not available in parallel

   Level: intermediate

.seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()`
@*/
PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
{
  Mat        A;
  Mat_H2OPUS *h2opus;
  MPI_Comm   comm;
  PetscBool  boundtocpu = PETSC_TRUE;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(B,MAT_CLASSID,1);
  PetscValidLogicalCollectiveInt(B,spacedim,2);
  PetscValidLogicalCollectiveBool(B,cdist,4);
  PetscValidLogicalCollectiveReal(B,eta,5);
  PetscValidLogicalCollectiveInt(B,leafsize,6);
  PetscValidLogicalCollectiveInt(B,maxrank,7);
  PetscValidLogicalCollectiveInt(B,bs,8);
  PetscValidLogicalCollectiveReal(B,rtol,9);
  PetscValidPointer(nA,10);
  /* continuation of MatCreateH2OpusFromMat */
  PetscCall(PetscObjectGetComm((PetscObject)B,&comm));
  PetscCheck(B->rmap->n == B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
  PetscCheck(B->rmap->N == B->cmap->N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
  PetscCall(MatCreate(comm,&A));
  PetscCall(MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N));
#if defined(PETSC_H2OPUS_USE_GPU)
  {
    /* run on the GPU only if the sampled operator B uses CUDA vectors and is not CPU-bound */
    PetscBool iscuda;
    VecType   vtype;

    PetscCall(MatGetVecType(B,&vtype));
    PetscCall(PetscStrcmp(vtype,VECCUDA,&iscuda));
    if (!iscuda) {
      PetscCall(PetscStrcmp(vtype,VECSEQCUDA,&iscuda));
      if (!iscuda) {
        PetscCall(PetscStrcmp(vtype,VECMPICUDA,&iscuda));
      }
    }
    if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
  }
#endif
  PetscCall(MatSetType(A,MATH2OPUS));
  PetscCall(MatBindToCPU(A,boundtocpu));
  if (spacedim) {
    PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,NULL,NULL));
  }
  PetscCall(MatPropagateSymmetryOptions(B,A));
  /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */

  /* non-positive values keep the defaults set in MatCreate_H2OPUS() */
  h2opus = (Mat_H2OPUS*)A->data;
  h2opus->sampler = new PetscMatrixSampler(B);
  if (eta > 0.) h2opus->eta = eta;
  if (leafsize > 0) h2opus->leafsize = leafsize;
  if (maxrank > 0) h2opus->max_rank = maxrank;
  if (bs > 0) h2opus->bs = bs;
  if (rtol > 0.) h2opus->rtol = rtol;
  *nA = A;
  A->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/*@C
     MatH2OpusGetIndexMap - Access reordering index set.

   Input Parameters:
.     A - the matrix

   Output Parameter:
.     indexmap - the index set for the reordering

   Level: intermediate

.seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
@*/
PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
{
  PetscBool  ish2opus;
  Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidType(A,1);
  PetscValidPointer(indexmap,2);
  PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
  PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
  /* borrowed reference: no extra PetscObjectReference is taken here */
  *indexmap = a->h2opus_indexmap;
  PetscFunctionReturn(0);
}

/*@C
     MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering

   Input Parameters:
+     A - the matrix
.     nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
-     in - the vector to be mapped

   Output Parameter:
.     out - the newly created mapped vector

   Level: intermediate

.seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
@*/
PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec* out)
{
  PetscBool   ish2opus;
  Mat_H2OPUS  *a = (Mat_H2OPUS*)A->data;
  PetscScalar *xin,*xout;
  PetscBool   nm;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidType(A,1);
  PetscValidLogicalCollectiveBool(A,nativetopetsc,2);
  PetscValidHeaderSpecific(in,VEC_CLASSID,3);
  PetscValidPointer(out,4);
  PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
  PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
  /* temporarily toggle native-mult mode so MatCreateVecs produces a vector with
     the layout of the *target* ordering, then restore the previous setting */
  nm = a->nativemult;
  PetscCall(MatH2OpusSetNativeMult(A,(PetscBool)!nativetopetsc));
  PetscCall(MatCreateVecs(A,out,NULL));
  PetscCall(MatH2OpusSetNativeMult(A,nm));
  if (!a->sf) { /* same ordering */
    PetscCall(VecCopy(in,*out));
    PetscFunctionReturn(0);
  }
  PetscCall(VecGetArrayRead(in,(const PetscScalar**)&xin));
  PetscCall(VecGetArrayWrite(*out,&xout));
  if (nativetopetsc) {
    /* H2Opus -> PETSc ordering: reduce along the SF */
    PetscCall(PetscSFReduceBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
    PetscCall(PetscSFReduceEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
  } else {
    /* PETSc -> H2Opus ordering: broadcast along the SF */
    PetscCall(PetscSFBcastBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
    PetscCall(PetscSFBcastEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
  }
  PetscCall(VecRestoreArrayRead(in,(const PetscScalar**)&xin));
  PetscCall(VecRestoreArrayWrite(*out,&xout));
  PetscFunctionReturn(0);
}

/*@C
     MatH2OpusLowRankUpdate - Perform a low-rank update of the form A = A + s * U * V^T

   Input Parameters:
+     A - the
hierarchical matrix 1816 . s - the scaling factor 1817 . U - the dense low-rank update matrix 1818 - V - (optional) the dense low-rank update matrix (if NULL, then V = U is assumed) 1819 1820 Notes: The U and V matrices must be in dense format 1821 1822 Level: intermediate 1823 1824 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE` 1825 @*/ 1826 PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s) 1827 { 1828 PetscBool flg; 1829 1830 PetscFunctionBegin; 1831 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 1832 PetscValidType(A,1); 1833 PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 1834 PetscValidHeaderSpecific(U,MAT_CLASSID,2); 1835 PetscCheckSameComm(A,1,U,2); 1836 if (V) { 1837 PetscValidHeaderSpecific(V,MAT_CLASSID,3); 1838 PetscCheckSameComm(A,1,V,3); 1839 } 1840 PetscValidLogicalCollectiveScalar(A,s,4); 1841 1842 if (!V) V = U; 1843 PetscCheck(U->cmap->N == V->cmap->N,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT,U->cmap->N,V->cmap->N); 1844 if (!U->cmap->N) PetscFunctionReturn(0); 1845 PetscCall(PetscLayoutCompare(U->rmap,A->rmap,&flg)); 1846 PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"A and U must have the same row layout"); 1847 PetscCall(PetscLayoutCompare(V->rmap,A->cmap,&flg)); 1848 PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"A column layout must match V row column layout"); 1849 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&flg)); 1850 if (flg) { 1851 Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1852 const PetscScalar *u,*v,*uu,*vv; 1853 PetscInt ldu,ldv; 1854 PetscMPIInt size; 1855 #if defined(H2OPUS_USE_MPI) 1856 h2opusHandle_t handle = a->handle->handle; 1857 #else 1858 h2opusHandle_t handle = a->handle; 1859 #endif 1860 
PetscBool usesf = (PetscBool)(a->sf && !a->nativemult); 1861 PetscSF usf,vsf; 1862 1863 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1864 PetscCheck(size <= 1,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet implemented in parallel"); 1865 PetscCall(PetscLogEventBegin(MAT_H2Opus_LR,A,0,0,0)); 1866 PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U,&flg,MATSEQDENSE,MATMPIDENSE,"")); 1867 PetscCheck(flg,PetscObjectComm((PetscObject)U),PETSC_ERR_SUP,"Not for U of type %s",((PetscObject)U)->type_name); 1868 PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V,&flg,MATSEQDENSE,MATMPIDENSE,"")); 1869 PetscCheck(flg,PetscObjectComm((PetscObject)V),PETSC_ERR_SUP,"Not for V of type %s",((PetscObject)V)->type_name); 1870 PetscCall(MatDenseGetLDA(U,&ldu)); 1871 PetscCall(MatDenseGetLDA(V,&ldv)); 1872 PetscCall(MatBoundToCPU(A,&flg)); 1873 if (usesf) { 1874 PetscInt n; 1875 1876 PetscCall(MatDenseGetH2OpusVectorSF(U,a->sf,&usf)); 1877 PetscCall(MatDenseGetH2OpusVectorSF(V,a->sf,&vsf)); 1878 PetscCall(MatH2OpusResizeBuffers_Private(A,U->cmap->N,V->cmap->N)); 1879 PetscCall(PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL)); 1880 ldu = n; 1881 ldv = n; 1882 } 1883 if (flg) { 1884 PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1885 PetscCall(MatDenseGetArrayRead(U,&u)); 1886 PetscCall(MatDenseGetArrayRead(V,&v)); 1887 if (usesf) { 1888 vv = MatH2OpusGetThrustPointer(*a->yy); 1889 PetscCall(PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE)); 1890 PetscCall(PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE)); 1891 if (U != V) { 1892 uu = MatH2OpusGetThrustPointer(*a->xx); 1893 PetscCall(PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE)); 1894 PetscCall(PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE)); 1895 } else uu = vv; 1896 } else { uu = u; vv = v; } 1897 hlru_global(*a->hmatrix,uu,ldu,vv,ldv,U->cmap->N,s,handle); 1898 
PetscCall(MatDenseRestoreArrayRead(U,&u)); 1899 PetscCall(MatDenseRestoreArrayRead(V,&v)); 1900 } else { 1901 #if defined(PETSC_H2OPUS_USE_GPU) 1902 PetscBool flgU, flgV; 1903 1904 PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1905 PetscCall(PetscObjectTypeCompareAny((PetscObject)U,&flgU,MATSEQDENSE,MATMPIDENSE,"")); 1906 if (flgU) PetscCall(MatConvert(U,MATDENSECUDA,MAT_INPLACE_MATRIX,&U)); 1907 PetscCall(PetscObjectTypeCompareAny((PetscObject)V,&flgV,MATSEQDENSE,MATMPIDENSE,"")); 1908 if (flgV) PetscCall(MatConvert(V,MATDENSECUDA,MAT_INPLACE_MATRIX,&V)); 1909 PetscCall(MatDenseCUDAGetArrayRead(U,&u)); 1910 PetscCall(MatDenseCUDAGetArrayRead(V,&v)); 1911 if (usesf) { 1912 vv = MatH2OpusGetThrustPointer(*a->yy_gpu); 1913 PetscCall(PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE)); 1914 PetscCall(PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE)); 1915 if (U != V) { 1916 uu = MatH2OpusGetThrustPointer(*a->xx_gpu); 1917 PetscCall(PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE)); 1918 PetscCall(PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE)); 1919 } else uu = vv; 1920 } else { uu = u; vv = v; } 1921 #else 1922 SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"This should not happen"); 1923 #endif 1924 hlru_global(*a->hmatrix_gpu,uu,ldu,vv,ldv,U->cmap->N,s,handle); 1925 #if defined(PETSC_H2OPUS_USE_GPU) 1926 PetscCall(MatDenseCUDARestoreArrayRead(U,&u)); 1927 PetscCall(MatDenseCUDARestoreArrayRead(V,&v)); 1928 if (flgU) PetscCall(MatConvert(U,MATDENSE,MAT_INPLACE_MATRIX,&U)); 1929 if (flgV) PetscCall(MatConvert(V,MATDENSE,MAT_INPLACE_MATRIX,&V)); 1930 #endif 1931 } 1932 PetscCall(PetscLogEventEnd(MAT_H2Opus_LR,A,0,0,0)); 1933 a->orthogonal = PETSC_FALSE; 1934 } 1935 PetscFunctionReturn(0); 1936 } 1937 #endif 1938