#include <h2opusconf.h>
/* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
#if !defined(__CUDACC__) || defined(H2OPUS_USE_GPU)
#include <h2opus.h>
#if defined(H2OPUS_USE_MPI)
#include <h2opus/distributed/distributed_h2opus_handle.h>
#include <h2opus/distributed/distributed_geometric_construction.h>
#include <h2opus/distributed/distributed_hgemv.h>
#include <h2opus/distributed/distributed_horthog.h>
#include <h2opus/distributed/distributed_hcompress.h>
#endif
#include <h2opus/util/boxentrygen.h>
#include <petsc/private/matimpl.h>
#include <petsc/private/vecimpl.h>
#include <petsc/private/deviceimpl.h>
#include <petscsf.h>

/* math2opusutils */
PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF,PetscInt,PetscInt,PetscInt,PetscSF*);
PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat,PetscSF,PetscSF*);
PETSC_INTERN PetscErrorCode VecSign(Vec,Vec);
PETSC_INTERN PetscErrorCode VecSetDelta(Vec,PetscInt);
PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat,NormType,PetscInt,PetscReal*);

/* Extract a raw device/host pointer from a thrust vector */
#define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())

/* Use GPU only if H2OPUS is configured for GPU */
#if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
#define PETSC_H2OPUS_USE_GPU
#endif
#if defined(PETSC_H2OPUS_USE_GPU)
/* Bind the matrix to the CPU when it already is, or when B (host-data-needed flag) is true;
   used before multiplies to migrate the hierarchical matrix to where the vectors live */
#define MatH2OpusUpdateIfNeeded(A,B) MatBindToCPU(A,(PetscBool)((A)->boundtocpu || (B)))
#else
#define MatH2OpusUpdateIfNeeded(A,B) 0
#endif

// TODO H2OPUS:
// DistributedHMatrix
// unsymmetric ?
// transpose for distributed_hgemv?
// clearData()
// Unify interface for sequential and parallel?
43 // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled) 44 // 45 template <class T> class PetscPointCloud : public H2OpusDataSet<T> 46 { 47 private: 48 int dimension; 49 size_t num_points; 50 std::vector<T> pts; 51 52 public: 53 PetscPointCloud(int dim, size_t num_pts, const T coords[]) 54 { 55 dim = dim > 0 ? dim : 1; 56 this->dimension = dim; 57 this->num_points = num_pts; 58 59 pts.resize(num_pts*dim); 60 if (coords) { 61 for (size_t n = 0; n < num_points; n++) 62 for (int i = 0; i < dim; i++) 63 pts[n*dim + i] = coords[n*dim + i]; 64 } else { 65 PetscReal h = 1./(num_points - 1); 66 for (size_t n = 0; n < num_points; n++) 67 for (int i = 0; i < dim; i++) 68 pts[n*dim + i] = i*h; 69 } 70 } 71 72 PetscPointCloud(const PetscPointCloud<T>& other) 73 { 74 size_t N = other.dimension * other.num_points; 75 this->dimension = other.dimension; 76 this->num_points = other.num_points; 77 this->pts.resize(N); 78 for (size_t i = 0; i < N; i++) 79 this->pts[i] = other.pts[i]; 80 } 81 82 int getDimension() const 83 { 84 return dimension; 85 } 86 87 size_t getDataSetSize() const 88 { 89 return num_points; 90 } 91 92 T getDataPoint(size_t idx, int dim) const 93 { 94 assert(dim < dimension && idx < num_points); 95 return pts[idx*dimension + dim]; 96 } 97 98 void Print(std::ostream& out = std::cout) 99 { 100 out << "Dimension: " << dimension << std::endl; 101 out << "NumPoints: " << num_points << std::endl; 102 for (size_t n = 0; n < num_points; n++) { 103 for (int d = 0; d < dimension; d++) 104 out << pts[n*dimension + d] << " "; 105 out << std::endl; 106 } 107 } 108 }; 109 110 template<class T> class PetscFunctionGenerator 111 { 112 private: 113 MatH2OpusKernel k; 114 int dim; 115 void *ctx; 116 117 public: 118 PetscFunctionGenerator(MatH2OpusKernel k, int dim, void* ctx) { this->k = k; this->dim = dim; this->ctx = ctx; } 119 PetscFunctionGenerator(PetscFunctionGenerator& other) { this->k = other.k; this->dim = other.dim; this->ctx = 
other.ctx; }
  /* Evaluate the user kernel at a pair of points, casting to the H2OPUS scalar type */
  T operator()(PetscReal *pt1, PetscReal *pt2)
  {
    return (T)((*this->k)(this->dim,pt1,pt2,this->ctx));
  }
};

#include <../src/mat/impls/h2opus/math2opussampler.hpp>

/* just to not clutter the code */
#if !defined(H2OPUS_USE_GPU)
typedef HMatrix HMatrix_GPU;
#if defined(H2OPUS_USE_MPI)
typedef DistributedHMatrix DistributedHMatrix_GPU;
#endif
#endif

/* Private data of a MATH2OPUS matrix */
typedef struct {
#if defined(H2OPUS_USE_MPI)
  distributedH2OpusHandle_t handle;
#else
  h2opusHandle_t handle;
#endif
  /* Sequential and parallel matrices are two different classes at the moment */
  HMatrix *hmatrix;
#if defined(H2OPUS_USE_MPI)
  DistributedHMatrix *dist_hmatrix;
#else
  HMatrix *dist_hmatrix; /* just to not clutter the code */
#endif
  /* May use permutations */
  PetscSF sf;                               /* PETSc ordering <-> H2OPUS ordering (built in MatSetUpMultiply_H2OPUS) */
  PetscLayout h2opus_rmap, h2opus_cmap;     /* layouts in the H2OPUS ordering, swapped in by MatH2OpusSetNativeMult */
  IS h2opus_indexmap;                       /* index map of the H2OPUS basis tree */
  thrust::host_vector<PetscScalar> *xx,*yy; /* host work buffers for permuted multiplies */
  PetscInt xxs,yys;                         /* number of columns the work buffers are currently sized for */
  PetscBool multsetup;                      /* true once MatSetUpMultiply_H2OPUS has run */

  /* GPU */
  HMatrix_GPU *hmatrix_gpu;
#if defined(H2OPUS_USE_MPI)
  DistributedHMatrix_GPU *dist_hmatrix_gpu;
#else
  HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */
#endif
#if defined(PETSC_H2OPUS_USE_GPU)
  thrust::device_vector<PetscScalar> *xx_gpu,*yy_gpu; /* device counterparts of xx/yy */
  PetscInt xxs_gpu,yys_gpu;
#endif

  /* construction from matvecs */
  PetscMatrixSampler* sampler;
  PetscBool nativemult; /* when true, vectors are already in the H2OPUS ordering and the SF is bypassed */

  /* Admissibility */
  PetscReal eta;
  PetscInt leafsize;

  /* for dof reordering */
  PetscPointCloud<PetscReal> *ptcloud;

  /* kernel for generating matrix entries */
  PetscFunctionGenerator<PetscScalar> *kernel;

  /* basis orthogonalized? */
  PetscBool orthogonal;

  /* customization */
  PetscInt basisord;         /* basis order for geometric construction */
  PetscInt max_rank;         /* maximum rank for construction from sampling */
  PetscInt bs;               /* number of concurrent samples */
  PetscReal rtol;            /* relative tolerance for construction from sampling */
  PetscInt norm_max_samples; /* samples used when estimating norms */
  PetscBool check_construction;
  PetscBool hara_verbose;

  /* keeps track of MatScale values */
  PetscScalar s;
} Mat_H2OPUS;

/* Destroy all H2OPUS data, the permutation SF/layouts, and the composed
   MatProduct callbacks */
static PetscErrorCode MatDestroy_H2OPUS(Mat A)
{
  Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;

  PetscFunctionBegin;
#if defined(H2OPUS_USE_MPI)
  h2opusDestroyDistributedHandle(a->handle);
#else
  h2opusDestroyHandle(a->handle);
#endif
  delete a->dist_hmatrix;
  delete a->hmatrix;
  PetscCall(PetscSFDestroy(&a->sf));
  PetscCall(PetscLayoutDestroy(&a->h2opus_rmap));
  PetscCall(PetscLayoutDestroy(&a->h2opus_cmap));
  PetscCall(ISDestroy(&a->h2opus_indexmap));
  delete a->xx;
  delete a->yy;
  delete a->hmatrix_gpu;
  delete a->dist_hmatrix_gpu;
#if defined(PETSC_H2OPUS_USE_GPU)
  delete a->xx_gpu;
  delete a->yy_gpu;
#endif
  delete a->sampler;
  delete a->ptcloud;
  delete a->kernel;
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",NULL));
  PetscCall(PetscObjectChangeTypeName((PetscObject)A,NULL));
  PetscCall(PetscFree(A->data));
  PetscFunctionReturn(0);
}

/* Toggle "native" multiply: when enabled, MatMult expects vectors in the
   H2OPUS (cluster tree) ordering, and the matrix layouts are swapped with the
   saved H2OPUS layouts so vector creation matches. No-op for non-H2OPUS A. */
PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
{
  Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
  PetscBool ish2opus;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidLogicalCollectiveBool(A,nm,2);
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
  if
 (ish2opus) {
    if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
      if ((!a->nativemult && nm) || (a->nativemult && !nm)) { /* only swap when the flag actually changes */
        PetscLayout t;
        t = A->rmap;
        A->rmap = a->h2opus_rmap;
        a->h2opus_rmap = t;
        t = A->cmap;
        A->cmap = a->h2opus_cmap;
        a->h2opus_cmap = t;
      }
    }
    a->nativemult = nm;
  }
  PetscFunctionReturn(0);
}

/* Query the native-multiply flag; errors if A is not of type MATH2OPUS */
PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
{
  Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
  PetscBool ish2opus;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(nm,2);
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
  PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
  *nm = a->nativemult;
  PetscFunctionReturn(0);
}

/* Approximate matrix norms via sampling (MatApproximateNorm_Private) */
PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal* n)
{
  PetscBool ish2opus;
  PetscInt nmax = PETSC_DECIDE;
  Mat_H2OPUS *a = NULL;
  PetscBool mult = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
  if (ish2opus) { /* set user-defined number of samples and fastpath for mult (norms are order independent) */
    a = (Mat_H2OPUS*)A->data;

    nmax = a->norm_max_samples;
    mult = a->nativemult;
    PetscCall(MatH2OpusSetNativeMult(A,PETSC_TRUE));
  } else {
    PetscCall(PetscOptionsGetInt(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_approximate_norm_samples",&nmax,NULL));
  }
  PetscCall(MatApproximateNorm_Private(A,normtype,nmax,n));
  if (a) PetscCall(MatH2OpusSetNativeMult(A,mult)); /* restore the previous native-mult state */
  PetscFunctionReturn(0);
}

/* Grow the host (or device) work buffers so they can hold xN (resp. yN) columns */
static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
{
  Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data;
  PetscInt n;
  PetscBool boundtocpu = PETSC_TRUE;

  PetscFunctionBegin;
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
  if (boundtocpu) {
    if (h2opus->xxs < xN) { h2opus->xx->resize(n*xN); h2opus->xxs = xN; }
    if (h2opus->yys < yN) { h2opus->yy->resize(n*yN); h2opus->yys = yN; }
  }
#if defined(PETSC_H2OPUS_USE_GPU)
  if (!boundtocpu) {
    if (h2opus->xxs_gpu < xN) { h2opus->xx_gpu->resize(n*xN); h2opus->xxs_gpu = xN; }
    if (h2opus->yys_gpu < yN) { h2opus->yy_gpu->resize(n*yN); h2opus->yys_gpu = yN; }
  }
#endif
  PetscFunctionReturn(0);
}

/* C = op(A) * B with B,C dense. Handles CPU and GPU execution, the optional
   permutation to the H2OPUS ordering (through the vector SFs and the work
   buffers), and both the sequential and the distributed matrix classes. */
static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
{
  Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data;
#if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = h2opus->handle->handle;
#else
  h2opusHandle_t handle = h2opus->handle;
#endif
  PetscBool boundtocpu = PETSC_TRUE;
  PetscScalar *xx,*yy,*uxx,*uyy;
  PetscInt blda,clda;
  PetscMPIInt size;
  PetscSF bsf,csf;
  PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

  PetscFunctionBegin;
  HLibProfile::clear();
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  PetscCall(MatDenseGetLDA(B,&blda));
  PetscCall(MatDenseGetLDA(C,&clda));
  if (usesf) { /* permuted multiply: go through work buffers with leading dimension from the SF */
    PetscInt n;

    PetscCall(MatDenseGetH2OpusVectorSF(B,h2opus->sf,&bsf));
    PetscCall(MatDenseGetH2OpusVectorSF(C,h2opus->sf,&csf));

    PetscCall(MatH2OpusResizeBuffers_Private(A,B->cmap->N,C->cmap->N));
    PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
    blda = n;
    clda = n;
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  if (boundtocpu) {
    PetscCall(MatDenseGetArrayRead(B,(const PetscScalar**)&xx));
    PetscCall(MatDenseGetArrayWrite(C,&yy));
    if (usesf) { /* scatter B into the H2OPUS ordering */
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy);

      PetscCall(PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
    } else {
      uxx = xx;
      uyy = yy;
    }
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
      PetscCheckFalse(transA && !A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
      hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
    }
    PetscCall(MatDenseRestoreArrayRead(B,(const PetscScalar**)&xx));
    if (usesf) { /* gather the result back into the PETSc ordering */
      PetscCall(PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
    }
    PetscCall(MatDenseRestoreArrayWrite(C,&yy));
#if defined(PETSC_H2OPUS_USE_GPU)
  } else {
    PetscBool ciscuda,biscuda;

    /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
    PetscCall(PetscObjectTypeCompareAny((PetscObject)B,&biscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
    if (!biscuda) {
      PetscCall(MatConvert(B,MATDENSECUDA,MAT_INPLACE_MATRIX,&B));
    }
    PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&ciscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
    if (!ciscuda) {
      C->assembled = PETSC_TRUE;
      PetscCall(MatConvert(C,MATDENSECUDA,MAT_INPLACE_MATRIX,&C));
    }
    PetscCall(MatDenseCUDAGetArrayRead(B,(const PetscScalar**)&xx));
    PetscCall(MatDenseCUDAGetArrayWrite(C,&yy));
    if (usesf) {
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
      PetscCall(PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
    } else {
      uxx = xx;
      uyy = yy;
    }
    PetscCall(PetscLogGpuTimeBegin());
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix");
      PetscCheckFalse(transA && !A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatDenseCUDARestoreArrayRead(B,(const PetscScalar**)&xx));
    if (usesf) {
      PetscCall(PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
    }
    PetscCall(MatDenseCUDARestoreArrayWrite(C,&yy));
    if (!biscuda) { /* convert back if we converted on the fly */
      PetscCall(MatConvert(B,MATDENSE,MAT_INPLACE_MATRIX,&B));
    }
    if (!ciscuda) {
      PetscCall(MatConvert(C,MATDENSE,MAT_INPLACE_MATRIX,&C));
    }
#endif
  }
  { /* log flops */
    double gops,time,perf,dev;
    HLibProfile::getHgemvPerf(gops,time,perf,dev);
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9*gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9*gops));
    }
#else
    PetscCall(PetscLogFlops(1e9*gops));
#endif
  }
  PetscFunctionReturn(0);
}

/* Numeric phase of MatProduct for the AB and AtB product types */
static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
{
  Mat_Product *product = C->product;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  switch (product->type) {
  case MATPRODUCT_AB:
    PetscCall(MatMultNKernel_H2OPUS(product->A,PETSC_FALSE,product->B,C));
    break;
  case MATPRODUCT_AtB:
    PetscCall(MatMultNKernel_H2OPUS(product->A,PETSC_TRUE,product->B,C));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase: set sizes/type of C and install the numeric callback */
static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
{
  Mat_Product *product = C->product;
  PetscBool cisdense;
  Mat A,B;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  A = product->A;
  B = product->B;
  switch (product->type) {
  case MATPRODUCT_AB:
    PetscCall(MatSetSizes(C,A->rmap->n,B->cmap->n,A->rmap->N,B->cmap->N));
    PetscCall(MatSetBlockSizesFromMats(C,product->A,product->B));
    PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
    if (!cisdense) PetscCall(MatSetType(C,((PetscObject)product->B)->type_name));
    PetscCall(MatSetUp(C));
    break;
  case MATPRODUCT_AtB:
    PetscCall(MatSetSizes(C,A->cmap->n,B->cmap->n,A->cmap->N,B->cmap->N));
    PetscCall(MatSetBlockSizesFromMats(C,product->A,product->B));
    PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
    if (!cisdense) PetscCall(MatSetType(C,((PetscObject)product->B)->type_name));
    PetscCall(MatSetUp(C));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
  }
  C->ops->productsymbolic = NULL;
  C->ops->productnumeric = MatProductNumeric_H2OPUS;
  PetscFunctionReturn(0);
}

/* Register the symbolic callback for the supported product types (AB, AtB) */
static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
{
  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) {
    C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
  }
  PetscFunctionReturn(0);
}

/* y = sy*y + op(A)*x; the single-vector analogue of MatMultNKernel_H2OPUS */
static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
{
  Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data;
#if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = h2opus->handle->handle;
#else
  h2opusHandle_t handle = h2opus->handle;
#endif
  PetscBool boundtocpu = PETSC_TRUE;
  PetscInt n;
  PetscScalar *xx,*yy,*uxx,*uyy;
  PetscMPIInt size;
  PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

  PetscFunctionBegin;
  HLibProfile::clear();
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  if (usesf) {
    PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
  } else n = A->rmap->n;
  if (boundtocpu) {
    PetscCall(VecGetArrayRead(x,(const PetscScalar**)&xx));
    if (sy == 0.0) {
      PetscCall(VecGetArrayWrite(y,&yy));
    } else {
      PetscCall(VecGetArray(y,&yy));
    }
    if (usesf) { /* scatter x (and y when accumulating) into the H2OPUS ordering */
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy);

      PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      if (sy != 0.0) {
        PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
      }
    } else {
      uxx = xx;
      uyy = yy;
    }
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
      PetscCheckFalse(trans && !A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
      hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
    }
    PetscCall(VecRestoreArrayRead(x,(const PetscScalar**)&xx));
    if (usesf) { /* gather the result back into the PETSc ordering */
      PetscCall(PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
    }
    if (sy == 0.0) {
      PetscCall(VecRestoreArrayWrite(y,&yy));
    } else {
      PetscCall(VecRestoreArray(y,&yy));
    }
#if defined(PETSC_H2OPUS_USE_GPU)
  } else {
    PetscCall(VecCUDAGetArrayRead(x,(const PetscScalar**)&xx));
    if (sy == 0.0) {
      PetscCall(VecCUDAGetArrayWrite(y,&yy));
    } else {
      PetscCall(VecCUDAGetArray(y,&yy));
    }
    if (usesf) {
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);

      PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      if (sy != 0.0) {
        PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
      }
    } else {
      uxx = xx;
      uyy = yy;
    }
    PetscCall(PetscLogGpuTimeBegin());
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix");
      PetscCheckFalse(trans && !A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArrayRead(x,(const PetscScalar**)&xx));
    if (usesf) {
      PetscCall(PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
    }
    if (sy == 0.0) {
      PetscCall(VecCUDARestoreArrayWrite(y,&yy));
    } else {
      PetscCall(VecCUDARestoreArray(y,&yy));
    }
#endif
  }
  { /* log flops */
    double gops,time,perf,dev;
    HLibProfile::getHgemvPerf(gops,time,perf,dev);
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9*gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9*gops));
    }
#else
    PetscCall(PetscLogFlops(1e9*gops));
#endif
  }
  PetscFunctionReturn(0);
}

/* y = A^T x; migrates the matrix to the CPU when the vectors are not CUDA vectors */
static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
{
  PetscBool xiscuda,yiscuda;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
  PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,""));
  PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda));
  PetscCall(MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* y = A x */
static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
{
  PetscBool xiscuda,yiscuda;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
  PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,""));
  PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda));
  PetscCall(MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* z = y + A^T x */
static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
{
  PetscBool xiscuda,ziscuda;

PetscFunctionBegin; 668 PetscCall(VecCopy(y,z)); 669 PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"")); 670 PetscCall(PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,"")); 671 PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda)); 672 PetscCall(MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_TRUE)); 673 PetscFunctionReturn(0); 674 } 675 676 static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z) 677 { 678 PetscBool xiscuda,ziscuda; 679 680 PetscFunctionBegin; 681 PetscCall(VecCopy(y,z)); 682 PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"")); 683 PetscCall(PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,"")); 684 PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda)); 685 PetscCall(MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_FALSE)); 686 PetscFunctionReturn(0); 687 } 688 689 static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s) 690 { 691 Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 692 693 PetscFunctionBegin; 694 a->s *= s; 695 PetscFunctionReturn(0); 696 } 697 698 static PetscErrorCode MatSetFromOptions_H2OPUS(PetscOptionItems *PetscOptionsObject,Mat A) 699 { 700 Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 701 702 PetscFunctionBegin; 703 PetscCall(PetscOptionsHead(PetscOptionsObject,"H2OPUS options")); 704 PetscCall(PetscOptionsInt("-mat_h2opus_leafsize","Leaf size of cluster tree",NULL,a->leafsize,&a->leafsize,NULL)); 705 PetscCall(PetscOptionsReal("-mat_h2opus_eta","Admissibility condition tolerance",NULL,a->eta,&a->eta,NULL)); 706 PetscCall(PetscOptionsInt("-mat_h2opus_order","Basis order for off-diagonal sampling when constructed from kernel",NULL,a->basisord,&a->basisord,NULL)); 707 PetscCall(PetscOptionsInt("-mat_h2opus_maxrank","Maximum rank when constructed from matvecs",NULL,a->max_rank,&a->max_rank,NULL)); 708 PetscCall(PetscOptionsInt("-mat_h2opus_samples","Maximum number of samples to be taken concurrently when constructing from 
matvecs",NULL,a->bs,&a->bs,NULL)); 709 PetscCall(PetscOptionsInt("-mat_h2opus_normsamples","Maximum bumber of samples to be when estimating norms",NULL,a->norm_max_samples,&a->norm_max_samples,NULL)); 710 PetscCall(PetscOptionsReal("-mat_h2opus_rtol","Relative tolerance for construction from sampling",NULL,a->rtol,&a->rtol,NULL)); 711 PetscCall(PetscOptionsBool("-mat_h2opus_check","Check error when constructing from sampling during MatAssemblyEnd()",NULL,a->check_construction,&a->check_construction,NULL)); 712 PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose","Verbose output from hara construction",NULL,a->hara_verbose,&a->hara_verbose,NULL)); 713 PetscCall(PetscOptionsTail()); 714 PetscFunctionReturn(0); 715 } 716 717 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat,PetscInt,const PetscReal[],PetscBool,MatH2OpusKernel,void*); 718 719 static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A) 720 { 721 Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 722 Vec c; 723 PetscInt spacedim; 724 const PetscScalar *coords; 725 726 PetscFunctionBegin; 727 if (a->ptcloud) PetscFunctionReturn(0); 728 PetscCall(PetscObjectQuery((PetscObject)A,"__math2opus_coords",(PetscObject*)&c)); 729 if (!c && a->sampler) { 730 Mat S = a->sampler->GetSamplingMat(); 731 732 PetscCall(PetscObjectQuery((PetscObject)S,"__math2opus_coords",(PetscObject*)&c)); 733 } 734 if (!c) { 735 PetscCall(MatH2OpusSetCoords_H2OPUS(A,-1,NULL,PETSC_FALSE,NULL,NULL)); 736 } else { 737 PetscCall(VecGetArrayRead(c,&coords)); 738 PetscCall(VecGetBlockSize(c,&spacedim)); 739 PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,PETSC_FALSE,NULL,NULL)); 740 PetscCall(VecRestoreArrayRead(c,&coords)); 741 } 742 PetscFunctionReturn(0); 743 } 744 745 static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A) 746 { 747 MPI_Comm comm; 748 PetscMPIInt size; 749 Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 750 PetscInt n = 0,*idx = NULL; 751 int *iidx = NULL; 752 PetscCopyMode own; 753 PetscBool rid; 754 755 PetscFunctionBegin; 756 if 
(a->multsetup) PetscFunctionReturn(0); 757 if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */ 758 PetscCall(PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL)); 759 #if defined(PETSC_H2OPUS_USE_GPU) 760 a->xx_gpu = new thrust::device_vector<PetscScalar>(n); 761 a->yy_gpu = new thrust::device_vector<PetscScalar>(n); 762 a->xxs_gpu = 1; 763 a->yys_gpu = 1; 764 #endif 765 a->xx = new thrust::host_vector<PetscScalar>(n); 766 a->yy = new thrust::host_vector<PetscScalar>(n); 767 a->xxs = 1; 768 a->yys = 1; 769 } else { 770 IS is; 771 PetscCall(PetscObjectGetComm((PetscObject)A,&comm)); 772 PetscCallMPI(MPI_Comm_size(comm,&size)); 773 if (!a->h2opus_indexmap) { 774 if (size > 1) { 775 PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix"); 776 #if defined(H2OPUS_USE_MPI) 777 iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map); 778 n = a->dist_hmatrix->basis_tree.basis_branch.index_map.size(); 779 #endif 780 } else { 781 iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map); 782 n = a->hmatrix->u_basis_tree.index_map.size(); 783 } 784 785 if (PetscDefined(USE_64BIT_INDICES)) { 786 PetscInt i; 787 788 own = PETSC_OWN_POINTER; 789 PetscCall(PetscMalloc1(n,&idx)); 790 for (i=0;i<n;i++) idx[i] = iidx[i]; 791 } else { 792 own = PETSC_COPY_VALUES; 793 idx = (PetscInt*)iidx; 794 } 795 PetscCall(ISCreateGeneral(comm,n,idx,own,&is)); 796 PetscCall(ISSetPermutation(is)); 797 PetscCall(ISViewFromOptions(is,(PetscObject)A,"-mat_h2opus_indexmap_view")); 798 a->h2opus_indexmap = is; 799 } 800 PetscCall(ISGetLocalSize(a->h2opus_indexmap,&n)); 801 PetscCall(ISGetIndices(a->h2opus_indexmap,(const PetscInt **)&idx)); 802 rid = (PetscBool)(n == A->rmap->n); 803 PetscCall(MPIU_Allreduce(MPI_IN_PLACE,&rid,1,MPIU_BOOL,MPI_LAND,comm)); 804 if (rid) { 805 PetscCall(ISIdentity(a->h2opus_indexmap,&rid)); 806 } 807 if (!rid) { 808 if (size > 1) { /* Parallel distribution may be different, save it 
here for fast path in MatMult (see MatH2OpusSetNativeMult) */
        PetscCall(PetscLayoutCreate(comm,&a->h2opus_rmap));
        PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap,n));
        PetscCall(PetscLayoutSetUp(a->h2opus_rmap));
        PetscCall(PetscLayoutReference(a->h2opus_rmap,&a->h2opus_cmap));
      }
      PetscCall(PetscSFCreate(comm,&a->sf));
      PetscCall(PetscSFSetGraphLayout(a->sf,A->rmap,n,NULL,PETSC_OWN_POINTER,idx));
      PetscCall(PetscSFViewFromOptions(a->sf,(PetscObject)A,"-mat_h2opus_sf_view"));
#if defined(PETSC_H2OPUS_USE_GPU)
      a->xx_gpu = new thrust::device_vector<PetscScalar>(n);
      a->yy_gpu = new thrust::device_vector<PetscScalar>(n);
      a->xxs_gpu = 1;
      a->yys_gpu = 1;
#endif
      /* work buffers initially sized for a single column */
      a->xx = new thrust::host_vector<PetscScalar>(n);
      a->yy = new thrust::host_vector<PetscScalar>(n);
      a->xxs = 1;
      a->yys = 1;
    }
    PetscCall(ISRestoreIndices(a->h2opus_indexmap,(const PetscInt **)&idx));
  }
  a->multsetup = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Assembly: (re)build the hierarchical matrix from the point cloud (geometric
   construction from a kernel, or structure-only followed by HARA sampling) */
static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
{
  Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
#if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = a->handle->handle;
#else
  h2opusHandle_t handle = a->handle;
#endif
  PetscBool kernel = PETSC_FALSE;
  PetscBool boundtocpu = PETSC_TRUE;
  PetscBool samplingdone = PETSC_FALSE;
  MPI_Comm comm;
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
  PetscCheckFalse(A->rmap->n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
  PetscCheckFalse(A->rmap->N != A->cmap->N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");

  /* XXX */
  a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));

  PetscCallMPI(MPI_Comm_size(comm,&size));
  /* TODO REUSABILITY of geometric construction */
  delete a->hmatrix;
  delete
a->dist_hmatrix; 860 #if defined(PETSC_H2OPUS_USE_GPU) 861 delete a->hmatrix_gpu; 862 delete a->dist_hmatrix_gpu; 863 #endif 864 a->orthogonal = PETSC_FALSE; 865 866 /* TODO: other? */ 867 H2OpusBoxCenterAdmissibility adm(a->eta); 868 869 PetscCall(PetscLogEventBegin(MAT_H2Opus_Build,A,0,0,0)); 870 if (size > 1) { 871 #if defined(H2OPUS_USE_MPI) 872 a->dist_hmatrix = new DistributedHMatrix(A->rmap->n/* ,A->symmetric */); 873 #else 874 a->dist_hmatrix = NULL; 875 #endif 876 } else { 877 a->hmatrix = new HMatrix(A->rmap->n,A->symmetric); 878 } 879 PetscCall(MatH2OpusInferCoordinates_Private(A)); 880 PetscCheck(a->ptcloud,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing pointcloud"); 881 if (a->kernel) { 882 BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel); 883 if (size > 1) { 884 PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix"); 885 #if defined(H2OPUS_USE_MPI) 886 buildDistributedHMatrix(*a->dist_hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord,a->handle); 887 #endif 888 } else { 889 buildHMatrix(*a->hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord); 890 } 891 kernel = PETSC_TRUE; 892 } else { 893 PetscCheckFalse(size > 1,comm,PETSC_ERR_SUP,"Construction from sampling not supported in parallel"); 894 buildHMatrixStructure(*a->hmatrix,a->ptcloud,a->leafsize,adm); 895 } 896 PetscCall(MatSetUpMultiply_H2OPUS(A)); 897 898 #if defined(PETSC_H2OPUS_USE_GPU) 899 boundtocpu = A->boundtocpu; 900 if (!boundtocpu) { 901 if (size > 1) { 902 PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix"); 903 #if defined(H2OPUS_USE_MPI) 904 a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix); 905 #endif 906 } else { 907 a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix); 908 } 909 } 910 #endif 911 if (size == 1) { 912 if (!kernel && a->sampler && a->sampler->GetSamplingMat()) { 913 PetscReal Anorm; 914 bool 
verbose; 915 916 PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_hara_verbose",&a->hara_verbose,NULL)); 917 verbose = a->hara_verbose; 918 PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(),NORM_2,a->norm_max_samples,&Anorm)); 919 if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF,"Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n",a->max_rank,a->rtol*Anorm,a->rtol,Anorm,boundtocpu ? "CPU" : "GPU",a->bs)); 920 if (a->sf && !a->nativemult) { 921 a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(),a->hmatrix->u_basis_tree.index_map.data()); 922 } 923 a->sampler->SetStream(handle->getMainStream()); 924 if (boundtocpu) { 925 a->sampler->SetGPUSampling(false); 926 hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose); 927 #if defined(PETSC_H2OPUS_USE_GPU) 928 } else { 929 a->sampler->SetGPUSampling(true); 930 hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose); 931 #endif 932 } 933 samplingdone = PETSC_TRUE; 934 } 935 } 936 #if defined(PETSC_H2OPUS_USE_GPU) 937 if (!boundtocpu) { 938 delete a->hmatrix; 939 delete a->dist_hmatrix; 940 a->hmatrix = NULL; 941 a->dist_hmatrix = NULL; 942 } 943 A->offloadmask = boundtocpu ? 
PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU; 944 #endif 945 PetscCall(PetscLogEventEnd(MAT_H2Opus_Build,A,0,0,0)); 946 947 if (!a->s) a->s = 1.0; 948 A->assembled = PETSC_TRUE; 949 950 if (samplingdone) { 951 PetscBool check = a->check_construction; 952 PetscBool checke = PETSC_FALSE; 953 954 PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check",&check,NULL)); 955 PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check_explicit",&checke,NULL)); 956 if (check) { 957 Mat E,Ae; 958 PetscReal n1,ni,n2; 959 PetscReal n1A,niA,n2A; 960 void (*normfunc)(void); 961 962 Ae = a->sampler->GetSamplingMat(); 963 PetscCall(MatConvert(A,MATSHELL,MAT_INITIAL_MATRIX,&E)); 964 PetscCall(MatShellSetOperation(E,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS)); 965 PetscCall(MatAXPY(E,-1.0,Ae,DIFFERENT_NONZERO_PATTERN)); 966 PetscCall(MatNorm(E,NORM_1,&n1)); 967 PetscCall(MatNorm(E,NORM_INFINITY,&ni)); 968 PetscCall(MatNorm(E,NORM_2,&n2)); 969 if (checke) { 970 Mat eA,eE,eAe; 971 972 PetscCall(MatComputeOperator(A,MATAIJ,&eA)); 973 PetscCall(MatComputeOperator(E,MATAIJ,&eE)); 974 PetscCall(MatComputeOperator(Ae,MATAIJ,&eAe)); 975 PetscCall(MatChop(eA,PETSC_SMALL)); 976 PetscCall(MatChop(eE,PETSC_SMALL)); 977 PetscCall(MatChop(eAe,PETSC_SMALL)); 978 PetscCall(PetscObjectSetName((PetscObject)eA,"H2Mat")); 979 PetscCall(MatView(eA,NULL)); 980 PetscCall(PetscObjectSetName((PetscObject)eAe,"S")); 981 PetscCall(MatView(eAe,NULL)); 982 PetscCall(PetscObjectSetName((PetscObject)eE,"H2Mat - S")); 983 PetscCall(MatView(eE,NULL)); 984 PetscCall(MatDestroy(&eA)); 985 PetscCall(MatDestroy(&eE)); 986 PetscCall(MatDestroy(&eAe)); 987 } 988 989 PetscCall(MatGetOperation(Ae,MATOP_NORM,&normfunc)); 990 PetscCall(MatSetOperation(Ae,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS)); 991 PetscCall(MatNorm(Ae,NORM_1,&n1A)); 992 PetscCall(MatNorm(Ae,NORM_INFINITY,&niA)); 993 PetscCall(MatNorm(Ae,NORM_2,&n2A)); 994 n1A = 
PetscMax(n1A,PETSC_SMALL); 995 n2A = PetscMax(n2A,PETSC_SMALL); 996 niA = PetscMax(niA,PETSC_SMALL); 997 PetscCall(MatSetOperation(Ae,MATOP_NORM,normfunc)); 998 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A),"MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n",(double)n1,(double)ni,(double)n2,(double)(n1/n1A),(double)(ni/niA),(double)(n2/n2A))); 999 PetscCall(MatDestroy(&E)); 1000 } 1001 a->sampler->SetSamplingMat(NULL); 1002 } 1003 PetscFunctionReturn(0); 1004 } 1005 1006 static PetscErrorCode MatZeroEntries_H2OPUS(Mat A) 1007 { 1008 PetscMPIInt size; 1009 Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1010 1011 PetscFunctionBegin; 1012 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1013 PetscCheckFalse(size > 1,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet supported"); 1014 else { 1015 a->hmatrix->clearData(); 1016 #if defined(PETSC_H2OPUS_USE_GPU) 1017 if (a->hmatrix_gpu) a->hmatrix_gpu->clearData(); 1018 #endif 1019 } 1020 PetscFunctionReturn(0); 1021 } 1022 1023 static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA) 1024 { 1025 Mat A; 1026 Mat_H2OPUS *a, *b = (Mat_H2OPUS*)B->data; 1027 #if defined(PETSC_H2OPUS_USE_GPU) 1028 PetscBool iscpu = PETSC_FALSE; 1029 #else 1030 PetscBool iscpu = PETSC_TRUE; 1031 #endif 1032 MPI_Comm comm; 1033 1034 PetscFunctionBegin; 1035 PetscCall(PetscObjectGetComm((PetscObject)B,&comm)); 1036 PetscCall(MatCreate(comm,&A)); 1037 PetscCall(MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N)); 1038 PetscCall(MatSetType(A,MATH2OPUS)); 1039 PetscCall(MatPropagateSymmetryOptions(B,A)); 1040 a = (Mat_H2OPUS*)A->data; 1041 1042 a->eta = b->eta; 1043 a->leafsize = b->leafsize; 1044 a->basisord = b->basisord; 1045 a->max_rank = b->max_rank; 1046 a->bs = b->bs; 1047 a->rtol = b->rtol; 1048 a->norm_max_samples = b->norm_max_samples; 1049 if (op == MAT_COPY_VALUES) a->s = b->s; 1050 1051 a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud); 1052 if 
(op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel); 1053 1054 #if defined(H2OPUS_USE_MPI) 1055 if (b->dist_hmatrix) { a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix); } 1056 #if defined(PETSC_H2OPUS_USE_GPU) 1057 if (b->dist_hmatrix_gpu) { a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu); } 1058 #endif 1059 #endif 1060 if (b->hmatrix) { 1061 a->hmatrix = new HMatrix(*b->hmatrix); 1062 if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData(); 1063 } 1064 #if defined(PETSC_H2OPUS_USE_GPU) 1065 if (b->hmatrix_gpu) { 1066 a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu); 1067 if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData(); 1068 } 1069 #endif 1070 if (b->sf) { 1071 PetscCall(PetscObjectReference((PetscObject)b->sf)); 1072 a->sf = b->sf; 1073 } 1074 if (b->h2opus_indexmap) { 1075 PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap)); 1076 a->h2opus_indexmap = b->h2opus_indexmap; 1077 } 1078 1079 PetscCall(MatSetUp(A)); 1080 PetscCall(MatSetUpMultiply_H2OPUS(A)); 1081 if (op == MAT_COPY_VALUES) { 1082 A->assembled = PETSC_TRUE; 1083 a->orthogonal = b->orthogonal; 1084 #if defined(PETSC_H2OPUS_USE_GPU) 1085 A->offloadmask = B->offloadmask; 1086 #endif 1087 } 1088 #if defined(PETSC_H2OPUS_USE_GPU) 1089 iscpu = B->boundtocpu; 1090 #endif 1091 PetscCall(MatBindToCPU(A,iscpu)); 1092 1093 *nA = A; 1094 PetscFunctionReturn(0); 1095 } 1096 1097 static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view) 1098 { 1099 Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data; 1100 PetscBool isascii; 1101 PetscMPIInt size; 1102 PetscViewerFormat format; 1103 1104 PetscFunctionBegin; 1105 PetscCall(PetscObjectTypeCompare((PetscObject)view,PETSCVIEWERASCII,&isascii)); 1106 PetscCall(PetscViewerGetFormat(view,&format)); 1107 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1108 if (isascii) { 1109 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1110 if (size == 1) { 1111 FILE 
*fp; 1112 PetscCall(PetscViewerASCIIGetPointer(view,&fp)); 1113 dumpHMatrix(*h2opus->hmatrix,6,fp); 1114 } 1115 } else { 1116 PetscCall(PetscViewerASCIIPrintf(view," H-Matrix constructed from %s\n",h2opus->kernel ? "Kernel" : "Mat")); 1117 PetscCall(PetscViewerASCIIPrintf(view," PointCloud dim %" PetscInt_FMT "\n",h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0)); 1118 PetscCall(PetscViewerASCIIPrintf(view," Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n",h2opus->leafsize,(double)h2opus->eta)); 1119 if (!h2opus->kernel) { 1120 PetscCall(PetscViewerASCIIPrintf(view," Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n",h2opus->max_rank,h2opus->bs,(double)h2opus->rtol)); 1121 } else { 1122 PetscCall(PetscViewerASCIIPrintf(view," Offdiagonal blocks approximation order %" PetscInt_FMT "\n",h2opus->basisord)); 1123 } 1124 PetscCall(PetscViewerASCIIPrintf(view," Number of samples for norms %" PetscInt_FMT "\n",h2opus->norm_max_samples)); 1125 if (size == 1) { 1126 double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0; 1127 double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0; 1128 #if defined(PETSC_HAVE_CUDA) 1129 double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0; 1130 double low_rank_gpu = h2opus->hmatrix_gpu ? 
h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0; 1131 #endif 1132 PetscCall(PetscViewerASCIIPrintf(view," Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu)); 1133 #if defined(PETSC_HAVE_CUDA) 1134 PetscCall(PetscViewerASCIIPrintf(view," Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu)); 1135 #endif 1136 } else { 1137 #if defined(PETSC_HAVE_CUDA) 1138 double matrix_mem[4] = {0.,0.,0.,0.}; 1139 PetscMPIInt rsize = 4; 1140 #else 1141 double matrix_mem[2] = {0.,0.}; 1142 PetscMPIInt rsize = 2; 1143 #endif 1144 #if defined(H2OPUS_USE_MPI) 1145 matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0; 1146 matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0; 1147 #if defined(PETSC_HAVE_CUDA) 1148 matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0; 1149 matrix_mem[3] = h2opus->dist_hmatrix_gpu ? 
h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0; 1150 #endif 1151 #endif 1152 PetscCall(MPIU_Allreduce(MPI_IN_PLACE,matrix_mem,rsize,MPI_DOUBLE_PRECISION,MPI_SUM,PetscObjectComm((PetscObject)A))); 1153 PetscCall(PetscViewerASCIIPrintf(view," Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1])); 1154 #if defined(PETSC_HAVE_CUDA) 1155 PetscCall(PetscViewerASCIIPrintf(view," Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3])); 1156 #endif 1157 } 1158 } 1159 } 1160 #if 0 1161 if (size == 1) { 1162 char filename[256]; 1163 const char *name; 1164 1165 PetscCall(PetscObjectGetName((PetscObject)A,&name)); 1166 PetscCall(PetscSNPrintf(filename,sizeof(filename),"%s_structure.eps",name)); 1167 outputEps(*h2opus->hmatrix,filename); 1168 } 1169 #endif 1170 PetscFunctionReturn(0); 1171 } 1172 1173 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx) 1174 { 1175 Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data; 1176 PetscReal *gcoords; 1177 PetscInt N; 1178 MPI_Comm comm; 1179 PetscMPIInt size; 1180 PetscBool cong; 1181 1182 PetscFunctionBegin; 1183 PetscCall(PetscLayoutSetUp(A->rmap)); 1184 PetscCall(PetscLayoutSetUp(A->cmap)); 1185 PetscCall(PetscObjectGetComm((PetscObject)A,&comm)); 1186 PetscCall(MatHasCongruentLayouts(A,&cong)); 1187 PetscCheck(cong,comm,PETSC_ERR_SUP,"Only for square matrices with congruent layouts"); 1188 N = A->rmap->N; 1189 PetscCallMPI(MPI_Comm_size(comm,&size)); 1190 if (spacedim > 0 && size > 1 && cdist) { 1191 PetscSF sf; 1192 MPI_Datatype dtype; 1193 1194 PetscCallMPI(MPI_Type_contiguous(spacedim,MPIU_REAL,&dtype)); 1195 PetscCallMPI(MPI_Type_commit(&dtype)); 1196 1197 PetscCall(PetscSFCreate(comm,&sf)); 1198 PetscCall(PetscSFSetGraphWithPattern(sf,A->rmap,PETSCSF_PATTERN_ALLGATHER)); 
1199 PetscCall(PetscMalloc1(spacedim*N,&gcoords)); 1200 PetscCall(PetscSFBcastBegin(sf,dtype,coords,gcoords,MPI_REPLACE)); 1201 PetscCall(PetscSFBcastEnd(sf,dtype,coords,gcoords,MPI_REPLACE)); 1202 PetscCall(PetscSFDestroy(&sf)); 1203 PetscCallMPI(MPI_Type_free(&dtype)); 1204 } else gcoords = (PetscReal*)coords; 1205 1206 delete h2opus->ptcloud; 1207 delete h2opus->kernel; 1208 h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim,N,gcoords); 1209 if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel,spacedim,kernelctx); 1210 if (gcoords != coords) PetscCall(PetscFree(gcoords)); 1211 A->preallocated = PETSC_TRUE; 1212 PetscFunctionReturn(0); 1213 } 1214 1215 #if defined(PETSC_H2OPUS_USE_GPU) 1216 static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg) 1217 { 1218 PetscMPIInt size; 1219 Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1220 1221 PetscFunctionBegin; 1222 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1223 if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) { 1224 if (size > 1) { 1225 PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1226 #if defined(H2OPUS_USE_MPI) 1227 if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu); 1228 else *a->dist_hmatrix = *a->dist_hmatrix_gpu; 1229 #endif 1230 } else { 1231 PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1232 if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu); 1233 else *a->hmatrix = *a->hmatrix_gpu; 1234 } 1235 delete a->hmatrix_gpu; 1236 delete a->dist_hmatrix_gpu; 1237 a->hmatrix_gpu = NULL; 1238 a->dist_hmatrix_gpu = NULL; 1239 A->offloadmask = PETSC_OFFLOAD_CPU; 1240 } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) { 1241 if (size > 1) { 1242 PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1243 #if defined(H2OPUS_USE_MPI) 1244 if (!a->dist_hmatrix_gpu) 
a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix); 1245 else *a->dist_hmatrix_gpu = *a->dist_hmatrix; 1246 #endif 1247 } else { 1248 PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1249 if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix); 1250 else *a->hmatrix_gpu = *a->hmatrix; 1251 } 1252 delete a->hmatrix; 1253 delete a->dist_hmatrix; 1254 a->hmatrix = NULL; 1255 a->dist_hmatrix = NULL; 1256 A->offloadmask = PETSC_OFFLOAD_GPU; 1257 } 1258 PetscCall(PetscFree(A->defaultvectype)); 1259 if (!flg) { 1260 PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype)); 1261 } else { 1262 PetscCall(PetscStrallocpy(VECSTANDARD,&A->defaultvectype)); 1263 } 1264 A->boundtocpu = flg; 1265 PetscFunctionReturn(0); 1266 } 1267 #endif 1268 1269 /*MC 1270 MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package. 1271 1272 Options Database Keys: 1273 . -mat_type h2opus - matrix type to "h2opus" during a call to MatSetFromOptions() 1274 1275 Notes: 1276 H2Opus implements hierarchical matrices in the H^2 flavour. 1277 It supports CPU or NVIDIA GPUs. 1278 For CPU only builds, use ./configure --download-h2opus --download-thrust to install PETSc to use H2Opus. 1279 In order to run on NVIDIA GPUs, use ./configure --download-h2opus --download-magma --download-kblas. 1280 For details and additional references, see 1281 "H2Opus: A distributed-memory multi-GPU software package for non-local operators", 1282 available at https://arxiv.org/abs/2109.05451. 
1283 1284 Level: beginner 1285 1286 .seealso: MATHTOOL, MATDENSE, MatCreateH2OpusFromKernel(), MatCreateH2OpusFromMat() 1287 M*/ 1288 PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A) 1289 { 1290 Mat_H2OPUS *a; 1291 PetscMPIInt size; 1292 1293 PetscFunctionBegin; 1294 #if defined(PETSC_H2OPUS_USE_GPU) 1295 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 1296 #endif 1297 PetscCall(PetscNewLog(A,&a)); 1298 A->data = (void*)a; 1299 1300 a->eta = 0.9; 1301 a->leafsize = 32; 1302 a->basisord = 4; 1303 a->max_rank = 64; 1304 a->bs = 32; 1305 a->rtol = 1.e-4; 1306 a->s = 1.0; 1307 a->norm_max_samples = 10; 1308 #if defined(H2OPUS_USE_MPI) 1309 h2opusCreateDistributedHandleComm(&a->handle,PetscObjectComm((PetscObject)A)); 1310 #else 1311 h2opusCreateHandle(&a->handle); 1312 #endif 1313 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1314 PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATH2OPUS)); 1315 PetscCall(PetscMemzero(A->ops,sizeof(struct _MatOps))); 1316 1317 A->ops->destroy = MatDestroy_H2OPUS; 1318 A->ops->view = MatView_H2OPUS; 1319 A->ops->assemblyend = MatAssemblyEnd_H2OPUS; 1320 A->ops->mult = MatMult_H2OPUS; 1321 A->ops->multtranspose = MatMultTranspose_H2OPUS; 1322 A->ops->multadd = MatMultAdd_H2OPUS; 1323 A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS; 1324 A->ops->scale = MatScale_H2OPUS; 1325 A->ops->duplicate = MatDuplicate_H2OPUS; 1326 A->ops->setfromoptions = MatSetFromOptions_H2OPUS; 1327 A->ops->norm = MatNorm_H2OPUS; 1328 A->ops->zeroentries = MatZeroEntries_H2OPUS; 1329 #if defined(PETSC_H2OPUS_USE_GPU) 1330 A->ops->bindtocpu = MatBindToCPU_H2OPUS; 1331 #endif 1332 1333 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",MatProductSetFromOptions_H2OPUS)); 1334 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",MatProductSetFromOptions_H2OPUS)); 1335 
PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",MatProductSetFromOptions_H2OPUS)); 1336 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",MatProductSetFromOptions_H2OPUS)); 1337 #if defined(PETSC_H2OPUS_USE_GPU) 1338 PetscCall(PetscFree(A->defaultvectype)); 1339 PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype)); 1340 #endif 1341 PetscFunctionReturn(0); 1342 } 1343 1344 /*@C 1345 MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix. 1346 1347 Input Parameter: 1348 . A - the matrix 1349 1350 Level: intermediate 1351 1352 .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress() 1353 @*/ 1354 PetscErrorCode MatH2OpusOrthogonalize(Mat A) 1355 { 1356 PetscBool ish2opus; 1357 Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1358 PetscMPIInt size; 1359 PetscBool boundtocpu = PETSC_TRUE; 1360 1361 PetscFunctionBegin; 1362 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 1363 PetscValidType(A,1); 1364 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus)); 1365 if (!ish2opus) PetscFunctionReturn(0); 1366 if (a->orthogonal) PetscFunctionReturn(0); 1367 HLibProfile::clear(); 1368 PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog,A,0,0,0)); 1369 #if defined(PETSC_H2OPUS_USE_GPU) 1370 boundtocpu = A->boundtocpu; 1371 #endif 1372 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1373 if (size > 1) { 1374 if (boundtocpu) { 1375 PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1376 #if defined(H2OPUS_USE_MPI) 1377 distributed_horthog(*a->dist_hmatrix, a->handle); 1378 #endif 1379 #if defined(PETSC_H2OPUS_USE_GPU) 1380 A->offloadmask = PETSC_OFFLOAD_CPU; 1381 } else { 1382 PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1383 PetscCall(PetscLogGpuTimeBegin()); 1384 #if defined(H2OPUS_USE_MPI) 
1385 distributed_horthog(*a->dist_hmatrix_gpu, a->handle); 1386 #endif 1387 PetscCall(PetscLogGpuTimeEnd()); 1388 #endif 1389 } 1390 } else { 1391 #if defined(H2OPUS_USE_MPI) 1392 h2opusHandle_t handle = a->handle->handle; 1393 #else 1394 h2opusHandle_t handle = a->handle; 1395 #endif 1396 if (boundtocpu) { 1397 PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1398 horthog(*a->hmatrix, handle); 1399 #if defined(PETSC_H2OPUS_USE_GPU) 1400 A->offloadmask = PETSC_OFFLOAD_CPU; 1401 } else { 1402 PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1403 PetscCall(PetscLogGpuTimeBegin()); 1404 horthog(*a->hmatrix_gpu, handle); 1405 PetscCall(PetscLogGpuTimeEnd()); 1406 #endif 1407 } 1408 } 1409 a->orthogonal = PETSC_TRUE; 1410 { /* log flops */ 1411 double gops,time,perf,dev; 1412 HLibProfile::getHorthogPerf(gops,time,perf,dev); 1413 #if defined(PETSC_H2OPUS_USE_GPU) 1414 if (boundtocpu) { 1415 PetscCall(PetscLogFlops(1e9*gops)); 1416 } else { 1417 PetscCall(PetscLogGpuFlops(1e9*gops)); 1418 } 1419 #else 1420 PetscCall(PetscLogFlops(1e9*gops)); 1421 #endif 1422 } 1423 PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog,A,0,0,0)); 1424 PetscFunctionReturn(0); 1425 } 1426 1427 /*@C 1428 MatH2OpusCompress - Compress a hierarchical matrix. 
1429 1430 Input Parameters: 1431 + A - the matrix 1432 - tol - the absolute truncation threshold 1433 1434 Level: intermediate 1435 1436 .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusOrthogonalize() 1437 @*/ 1438 PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol) 1439 { 1440 PetscBool ish2opus; 1441 Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1442 PetscMPIInt size; 1443 PetscBool boundtocpu = PETSC_TRUE; 1444 1445 PetscFunctionBegin; 1446 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 1447 PetscValidType(A,1); 1448 PetscValidLogicalCollectiveReal(A,tol,2); 1449 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus)); 1450 if (!ish2opus || tol <= 0.0) PetscFunctionReturn(0); 1451 PetscCall(MatH2OpusOrthogonalize(A)); 1452 HLibProfile::clear(); 1453 PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress,A,0,0,0)); 1454 #if defined(PETSC_H2OPUS_USE_GPU) 1455 boundtocpu = A->boundtocpu; 1456 #endif 1457 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1458 if (size > 1) { 1459 if (boundtocpu) { 1460 PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1461 #if defined(H2OPUS_USE_MPI) 1462 distributed_hcompress(*a->dist_hmatrix, tol, a->handle); 1463 #endif 1464 #if defined(PETSC_H2OPUS_USE_GPU) 1465 A->offloadmask = PETSC_OFFLOAD_CPU; 1466 } else { 1467 PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1468 PetscCall(PetscLogGpuTimeBegin()); 1469 #if defined(H2OPUS_USE_MPI) 1470 distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle); 1471 #endif 1472 PetscCall(PetscLogGpuTimeEnd()); 1473 #endif 1474 } 1475 } else { 1476 #if defined(H2OPUS_USE_MPI) 1477 h2opusHandle_t handle = a->handle->handle; 1478 #else 1479 h2opusHandle_t handle = a->handle; 1480 #endif 1481 if (boundtocpu) { 1482 PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1483 
hcompress(*a->hmatrix, tol, handle); 1484 #if defined(PETSC_H2OPUS_USE_GPU) 1485 A->offloadmask = PETSC_OFFLOAD_CPU; 1486 } else { 1487 PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1488 PetscCall(PetscLogGpuTimeBegin()); 1489 hcompress(*a->hmatrix_gpu, tol, handle); 1490 PetscCall(PetscLogGpuTimeEnd()); 1491 #endif 1492 } 1493 } 1494 { /* log flops */ 1495 double gops,time,perf,dev; 1496 HLibProfile::getHcompressPerf(gops,time,perf,dev); 1497 #if defined(PETSC_H2OPUS_USE_GPU) 1498 if (boundtocpu) { 1499 PetscCall(PetscLogFlops(1e9*gops)); 1500 } else { 1501 PetscCall(PetscLogGpuFlops(1e9*gops)); 1502 } 1503 #else 1504 PetscCall(PetscLogFlops(1e9*gops)); 1505 #endif 1506 } 1507 PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress,A,0,0,0)); 1508 PetscFunctionReturn(0); 1509 } 1510 1511 /*@C 1512 MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix vector product to construct a hierarchical matrix. 1513 1514 Input Parameters: 1515 + A - the hierarchical matrix 1516 . B - the matrix to be sampled 1517 . bs - maximum number of samples to be taken concurrently 1518 - tol - relative tolerance for construction 1519 1520 Notes: Need to call MatAssemblyBegin/End() to update the hierarchical matrix. 
1521 1522 Level: intermediate 1523 1524 .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress(), MatH2OpusOrthogonalize() 1525 @*/ 1526 PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol) 1527 { 1528 PetscBool ish2opus; 1529 1530 PetscFunctionBegin; 1531 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 1532 PetscValidType(A,1); 1533 if (B) PetscValidHeaderSpecific(B,MAT_CLASSID,2); 1534 PetscValidLogicalCollectiveInt(A,bs,3); 1535 PetscValidLogicalCollectiveReal(A,tol,3); 1536 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus)); 1537 if (ish2opus) { 1538 Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1539 1540 if (!a->sampler) a->sampler = new PetscMatrixSampler(); 1541 a->sampler->SetSamplingMat(B); 1542 if (bs > 0) a->bs = bs; 1543 if (tol > 0.) a->rtol = tol; 1544 delete a->kernel; 1545 } 1546 PetscFunctionReturn(0); 1547 } 1548 1549 /*@C 1550 MatCreateH2OpusFromKernel - Creates a MATH2OPUS from a user-supplied kernel. 1551 1552 Input Parameters: 1553 + comm - MPI communicator 1554 . m - number of local rows (or PETSC_DECIDE to have calculated if M is given) 1555 . n - number of local columns (or PETSC_DECIDE to have calculated if N is given) 1556 . M - number of global rows (or PETSC_DETERMINE to have calculated if m is given) 1557 . N - number of global columns (or PETSC_DETERMINE to have calculated if n is given) 1558 . spacedim - dimension of the space coordinates 1559 . coords - coordinates of the points 1560 . cdist - whether or not coordinates are distributed 1561 . kernel - computational kernel (or NULL) 1562 . kernelctx - kernel context 1563 . eta - admissibility condition tolerance 1564 . leafsize - leaf size in cluster tree 1565 - basisord - approximation order for Chebychev interpolation of low-rank blocks 1566 1567 Output Parameter: 1568 . nA - matrix 1569 1570 Options Database Keys: 1571 + -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree 1572 . 
-mat_h2opus_eta <PetscReal> - Admissibility condition tolerance 1573 . -mat_h2opus_order <PetscInt> - Chebychev approximation order 1574 - -mat_h2opus_normsamples <PetscInt> - Maximum number of samples to be used when estimating norms 1575 1576 Level: intermediate 1577 1578 .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat() 1579 @*/ 1580 PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat* nA) 1581 { 1582 Mat A; 1583 Mat_H2OPUS *h2opus; 1584 #if defined(PETSC_H2OPUS_USE_GPU) 1585 PetscBool iscpu = PETSC_FALSE; 1586 #else 1587 PetscBool iscpu = PETSC_TRUE; 1588 #endif 1589 1590 PetscFunctionBegin; 1591 PetscCheckFalse(m != n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported"); 1592 PetscCall(MatCreate(comm,&A)); 1593 PetscCall(MatSetSizes(A,m,n,M,N)); 1594 PetscCheckFalse(M != N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported"); 1595 PetscCall(MatSetType(A,MATH2OPUS)); 1596 PetscCall(MatBindToCPU(A,iscpu)); 1597 PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,kernel,kernelctx)); 1598 1599 h2opus = (Mat_H2OPUS*)A->data; 1600 if (eta > 0.) h2opus->eta = eta; 1601 if (leafsize > 0) h2opus->leafsize = leafsize; 1602 if (basisord > 0) h2opus->basisord = basisord; 1603 1604 *nA = A; 1605 PetscFunctionReturn(0); 1606 } 1607 1608 /*@C 1609 MatCreateH2OpusFromMat - Creates a MATH2OPUS sampling from a user-supplied operator. 1610 1611 Input Parameters: 1612 + B - the matrix to be sampled 1613 . spacedim - dimension of the space coordinates 1614 . coords - coordinates of the points 1615 . cdist - whether or not coordinates are distributed 1616 . eta - admissibility condition tolerance 1617 . leafsize - leaf size in cluster tree 1618 . maxrank - maximum rank allowed 1619 . 
bs - maximum number of samples to be taken concurrently 1620 - rtol - relative tolerance for construction 1621 1622 Output Parameter: 1623 . nA - matrix 1624 1625 Options Database Keys: 1626 + -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree 1627 . -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance 1628 . -mat_h2opus_maxrank <PetscInt> - Maximum rank when constructed from matvecs 1629 . -mat_h2opus_samples <PetscInt> - Maximum number of samples to be taken concurrently when constructing from matvecs 1630 . -mat_h2opus_rtol <PetscReal> - Relative tolerance for construction from sampling 1631 . -mat_h2opus_check <PetscBool> - Check error when constructing from sampling during MatAssemblyEnd() 1632 . -mat_h2opus_hara_verbose <PetscBool> - Verbose output from hara construction 1633 - -mat_h2opus_normsamples <PetscInt> - Maximum bumber of samples to be when estimating norms 1634 1635 Notes: not available in parallel 1636 1637 Level: intermediate 1638 1639 .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromKernel() 1640 @*/ 1641 PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA) 1642 { 1643 Mat A; 1644 Mat_H2OPUS *h2opus; 1645 MPI_Comm comm; 1646 PetscBool boundtocpu = PETSC_TRUE; 1647 1648 PetscFunctionBegin; 1649 PetscValidHeaderSpecific(B,MAT_CLASSID,1); 1650 PetscValidLogicalCollectiveInt(B,spacedim,2); 1651 PetscValidLogicalCollectiveBool(B,cdist,4); 1652 PetscValidLogicalCollectiveReal(B,eta,5); 1653 PetscValidLogicalCollectiveInt(B,leafsize,6); 1654 PetscValidLogicalCollectiveInt(B,maxrank,7); 1655 PetscValidLogicalCollectiveInt(B,bs,8); 1656 PetscValidLogicalCollectiveReal(B,rtol,9); 1657 PetscValidPointer(nA,10); 1658 PetscCall(PetscObjectGetComm((PetscObject)B,&comm)); 1659 PetscCheckFalse(B->rmap->n != B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not 
supported"); 1660 PetscCheckFalse(B->rmap->N != B->cmap->N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported"); 1661 PetscCall(MatCreate(comm,&A)); 1662 PetscCall(MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N)); 1663 #if defined(PETSC_H2OPUS_USE_GPU) 1664 { 1665 PetscBool iscuda; 1666 VecType vtype; 1667 1668 PetscCall(MatGetVecType(B,&vtype)); 1669 PetscCall(PetscStrcmp(vtype,VECCUDA,&iscuda)); 1670 if (!iscuda) { 1671 PetscCall(PetscStrcmp(vtype,VECSEQCUDA,&iscuda)); 1672 if (!iscuda) { 1673 PetscCall(PetscStrcmp(vtype,VECMPICUDA,&iscuda)); 1674 } 1675 } 1676 if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE; 1677 } 1678 #endif 1679 PetscCall(MatSetType(A,MATH2OPUS)); 1680 PetscCall(MatBindToCPU(A,boundtocpu)); 1681 if (spacedim) { 1682 PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,NULL,NULL)); 1683 } 1684 PetscCall(MatPropagateSymmetryOptions(B,A)); 1685 /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */ 1686 1687 h2opus = (Mat_H2OPUS*)A->data; 1688 h2opus->sampler = new PetscMatrixSampler(B); 1689 if (eta > 0.) h2opus->eta = eta; 1690 if (leafsize > 0) h2opus->leafsize = leafsize; 1691 if (maxrank > 0) h2opus->max_rank = maxrank; 1692 if (bs > 0) h2opus->bs = bs; 1693 if (rtol > 0.) h2opus->rtol = rtol; 1694 *nA = A; 1695 A->preallocated = PETSC_TRUE; 1696 PetscFunctionReturn(0); 1697 } 1698 1699 /*@C 1700 MatH2OpusGetIndexMap - Access reordering index set. 1701 1702 Input Parameters: 1703 . A - the matrix 1704 1705 Output Parameter: 1706 . 
indexmap - the index set for the reordering 1707 1708 Level: intermediate 1709 1710 .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel() 1711 @*/ 1712 PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap) 1713 { 1714 PetscBool ish2opus; 1715 Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1716 1717 PetscFunctionBegin; 1718 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 1719 PetscValidType(A,1); 1720 PetscValidPointer(indexmap,2); 1721 PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 1722 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus)); 1723 PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name); 1724 *indexmap = a->h2opus_indexmap; 1725 PetscFunctionReturn(0); 1726 } 1727 1728 /*@C 1729 MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering 1730 1731 Input Parameters: 1732 + A - the matrix 1733 . nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map 1734 - in - the vector to be mapped 1735 1736 Output Parameter: 1737 . 
out - the newly created mapped vector

   Level: intermediate

.seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel()
@*/
PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec* out)
{
  PetscBool ish2opus;
  Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
  PetscScalar *xin,*xout;
  PetscBool nm;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidType(A,1);
  PetscValidLogicalCollectiveBool(A,nativetopetsc,2);
  PetscValidHeaderSpecific(in,VEC_CLASSID,3);
  PetscValidPointer(out,4);
  PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
  PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
  /* Temporarily flip the native-mult flag so that MatCreateVecs sizes *out for
     the ordering we are mapping INTO, then restore the user's setting */
  nm = a->nativemult;
  PetscCall(MatH2OpusSetNativeMult(A,(PetscBool)!nativetopetsc));
  PetscCall(MatCreateVecs(A,out,NULL));
  PetscCall(MatH2OpusSetNativeMult(A,nm));
  if (!a->sf) { /* same ordering */
    PetscCall(VecCopy(in,*out));
    PetscFunctionReturn(0);
  }
  PetscCall(VecGetArrayRead(in,(const PetscScalar**)&xin));
  PetscCall(VecGetArrayWrite(*out,&xout));
  /* a->sf encodes the PETSc <-> H2Opus permutation; Bcast applies it one way,
     Reduce (with MPI_REPLACE) applies the inverse direction over the same SF */
  if (nativetopetsc) {
    PetscCall(PetscSFReduceBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
    PetscCall(PetscSFReduceEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
  } else {
    PetscCall(PetscSFBcastBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
    PetscCall(PetscSFBcastEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
  }
  PetscCall(VecRestoreArrayRead(in,(const PetscScalar**)&xin));
  PetscCall(VecRestoreArrayWrite(*out,&xout));
  PetscFunctionReturn(0);
}

/*@C
   MatH2OpusLowRankUpdate - Perform a low-rank update of the form A = A + s * U * V^T

   Input Parameters:
+  A - the hierarchical
matrix 1786 . s - the scaling factor 1787 . U - the dense low-rank update matrix 1788 - V - (optional) the dense low-rank update matrix (if NULL, then V = U is assumed) 1789 1790 Notes: The U and V matrices must be in dense format 1791 1792 Level: intermediate 1793 1794 .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress(), MatH2OpusOrthogonalize(), MATDENSE 1795 @*/ 1796 PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s) 1797 { 1798 PetscBool flg; 1799 1800 PetscFunctionBegin; 1801 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 1802 PetscValidType(A,1); 1803 PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 1804 PetscValidHeaderSpecific(U,MAT_CLASSID,2); 1805 PetscCheckSameComm(A,1,U,2); 1806 if (V) { 1807 PetscValidHeaderSpecific(V,MAT_CLASSID,3); 1808 PetscCheckSameComm(A,1,V,3); 1809 } 1810 PetscValidLogicalCollectiveScalar(A,s,4); 1811 1812 if (!V) V = U; 1813 PetscCheckFalse(U->cmap->N != V->cmap->N,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT,U->cmap->N,V->cmap->N); 1814 if (!U->cmap->N) PetscFunctionReturn(0); 1815 PetscCall(PetscLayoutCompare(U->rmap,A->rmap,&flg)); 1816 PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"A and U must have the same row layout"); 1817 PetscCall(PetscLayoutCompare(V->rmap,A->cmap,&flg)); 1818 PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"A column layout must match V row column layout"); 1819 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&flg)); 1820 if (flg) { 1821 Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1822 const PetscScalar *u,*v,*uu,*vv; 1823 PetscInt ldu,ldv; 1824 PetscMPIInt size; 1825 #if defined(H2OPUS_USE_MPI) 1826 h2opusHandle_t handle = a->handle->handle; 1827 #else 1828 h2opusHandle_t handle = a->handle; 1829 #endif 1830 PetscBool usesf = 
(PetscBool)(a->sf && !a->nativemult); 1831 PetscSF usf,vsf; 1832 1833 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1834 PetscCheckFalse(size > 1,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet implemented in parallel"); 1835 PetscCall(PetscLogEventBegin(MAT_H2Opus_LR,A,0,0,0)); 1836 PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U,&flg,MATSEQDENSE,MATMPIDENSE,"")); 1837 PetscCheck(flg,PetscObjectComm((PetscObject)U),PETSC_ERR_SUP,"Not for U of type %s",((PetscObject)U)->type_name); 1838 PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V,&flg,MATSEQDENSE,MATMPIDENSE,"")); 1839 PetscCheck(flg,PetscObjectComm((PetscObject)V),PETSC_ERR_SUP,"Not for V of type %s",((PetscObject)V)->type_name); 1840 PetscCall(MatDenseGetLDA(U,&ldu)); 1841 PetscCall(MatDenseGetLDA(V,&ldv)); 1842 PetscCall(MatBoundToCPU(A,&flg)); 1843 if (usesf) { 1844 PetscInt n; 1845 1846 PetscCall(MatDenseGetH2OpusVectorSF(U,a->sf,&usf)); 1847 PetscCall(MatDenseGetH2OpusVectorSF(V,a->sf,&vsf)); 1848 PetscCall(MatH2OpusResizeBuffers_Private(A,U->cmap->N,V->cmap->N)); 1849 PetscCall(PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL)); 1850 ldu = n; 1851 ldv = n; 1852 } 1853 if (flg) { 1854 PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1855 PetscCall(MatDenseGetArrayRead(U,&u)); 1856 PetscCall(MatDenseGetArrayRead(V,&v)); 1857 if (usesf) { 1858 vv = MatH2OpusGetThrustPointer(*a->yy); 1859 PetscCall(PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE)); 1860 PetscCall(PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE)); 1861 if (U != V) { 1862 uu = MatH2OpusGetThrustPointer(*a->xx); 1863 PetscCall(PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE)); 1864 PetscCall(PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE)); 1865 } else uu = vv; 1866 } else { uu = u; vv = v; } 1867 hlru_global(*a->hmatrix,uu,ldu,vv,ldv,U->cmap->N,s,handle); 1868 PetscCall(MatDenseRestoreArrayRead(U,&u)); 
1869 PetscCall(MatDenseRestoreArrayRead(V,&v)); 1870 } else { 1871 #if defined(PETSC_H2OPUS_USE_GPU) 1872 PetscBool flgU, flgV; 1873 1874 PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1875 PetscCall(PetscObjectTypeCompareAny((PetscObject)U,&flgU,MATSEQDENSE,MATMPIDENSE,"")); 1876 if (flgU) PetscCall(MatConvert(U,MATDENSECUDA,MAT_INPLACE_MATRIX,&U)); 1877 PetscCall(PetscObjectTypeCompareAny((PetscObject)V,&flgV,MATSEQDENSE,MATMPIDENSE,"")); 1878 if (flgV) PetscCall(MatConvert(V,MATDENSECUDA,MAT_INPLACE_MATRIX,&V)); 1879 PetscCall(MatDenseCUDAGetArrayRead(U,&u)); 1880 PetscCall(MatDenseCUDAGetArrayRead(V,&v)); 1881 if (usesf) { 1882 vv = MatH2OpusGetThrustPointer(*a->yy_gpu); 1883 PetscCall(PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE)); 1884 PetscCall(PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE)); 1885 if (U != V) { 1886 uu = MatH2OpusGetThrustPointer(*a->xx_gpu); 1887 PetscCall(PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE)); 1888 PetscCall(PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE)); 1889 } else uu = vv; 1890 } else { uu = u; vv = v; } 1891 #else 1892 SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"This should not happen"); 1893 #endif 1894 hlru_global(*a->hmatrix_gpu,uu,ldu,vv,ldv,U->cmap->N,s,handle); 1895 #if defined(PETSC_H2OPUS_USE_GPU) 1896 PetscCall(MatDenseCUDARestoreArrayRead(U,&u)); 1897 PetscCall(MatDenseCUDARestoreArrayRead(V,&v)); 1898 if (flgU) PetscCall(MatConvert(U,MATDENSE,MAT_INPLACE_MATRIX,&U)); 1899 if (flgV) PetscCall(MatConvert(V,MATDENSE,MAT_INPLACE_MATRIX,&V)); 1900 #endif 1901 } 1902 PetscCall(PetscLogEventEnd(MAT_H2Opus_LR,A,0,0,0)); 1903 a->orthogonal = PETSC_FALSE; 1904 } 1905 PetscFunctionReturn(0); 1906 } 1907 #endif 1908