1 2 /* 3 Defines matrix-matrix product routines for pairs of SeqAIJ matrices 4 C = A * B 5 */ 6 7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8 #include <../src/mat/utils/freespace.h> 9 #include <petscbt.h> 10 #include <petsc/private/isimpl.h> 11 #include <../src/mat/impls/dense/seq/dense.h> 12 13 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 14 { 15 PetscFunctionBegin; 16 if (C->ops->matmultnumeric) { 17 PetscCheck(C->ops->matmultnumeric != MatMatMultNumeric_SeqAIJ_SeqAIJ,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Recursive call"); 18 PetscCall((*C->ops->matmultnumeric)(A,B,C)); 19 } else { 20 PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C)); 21 } 22 PetscFunctionReturn(0); 23 } 24 25 /* Modified from MatCreateSeqAIJWithArrays() */ 26 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat) 27 { 28 PetscInt ii; 29 Mat_SeqAIJ *aij; 30 PetscBool isseqaij, osingle, ofree_a, ofree_ij; 31 32 PetscFunctionBegin; 33 PetscCheck(m <= 0 || !i[0],PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0"); 34 PetscCall(MatSetSizes(mat,m,n,m,n)); 35 36 if (!mtype) { 37 PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij)); 38 if (!isseqaij) PetscCall(MatSetType(mat,MATSEQAIJ)); 39 } else { 40 PetscCall(MatSetType(mat,mtype)); 41 } 42 43 aij = (Mat_SeqAIJ*)(mat)->data; 44 osingle = aij->singlemalloc; 45 ofree_a = aij->free_a; 46 ofree_ij = aij->free_ij; 47 /* changes the free flags */ 48 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,NULL)); 49 50 PetscCall(PetscFree(aij->ilen)); 51 PetscCall(PetscFree(aij->imax)); 52 PetscCall(PetscMalloc1(m,&aij->imax)); 53 PetscCall(PetscMalloc1(m,&aij->ilen)); 54 for (ii=0,aij->nonzerorowcnt=0,aij->rmax = 0; ii<m; ii++) { 55 const PetscInt rnz = i[ii+1] - i[ii]; 56 aij->nonzerorowcnt += !!rnz; 57 aij->rmax = PetscMax(aij->rmax,rnz); 58 aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii]; 59 } 60 aij->maxnz = i[m]; 61 aij->nz = i[m]; 62 63 if (osingle) { 64 PetscCall(PetscFree3(aij->a,aij->j,aij->i)); 65 } else { 66 if (ofree_a) PetscCall(PetscFree(aij->a)); 67 if (ofree_ij) PetscCall(PetscFree(aij->j)); 68 if (ofree_ij) PetscCall(PetscFree(aij->i)); 69 } 70 aij->i = i; 71 aij->j = j; 72 aij->a = a; 73 aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */ 74 /* default to not retain ownership */ 75 aij->singlemalloc = PETSC_FALSE; 76 aij->free_a = PETSC_FALSE; 77 aij->free_ij = PETSC_FALSE; 78 PetscCall(MatCheckCompressedRow(mat,aij->nonzerorowcnt,&aij->compressedrow,aij->i,m,0.6)); 79 PetscFunctionReturn(0); 80 } 81 82 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C) 83 { 84 Mat_Product *product = C->product; 85 MatProductAlgorithm alg; 86 PetscBool flg; 87 88 PetscFunctionBegin; 89 if (product) { 90 alg = product->alg; 91 } else { 92 alg = "sorted"; 93 } 94 /* sorted */ 95 PetscCall(PetscStrcmp(alg,"sorted",&flg)); 96 if (flg) { 97 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C)); 98 PetscFunctionReturn(0); 99 } 100 101 /* scalable */ 102 PetscCall(PetscStrcmp(alg,"scalable",&flg)); 103 if (flg) { 104 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C)); 105 PetscFunctionReturn(0); 106 } 107 108 /* scalable_fast */ 109 PetscCall(PetscStrcmp(alg,"scalable_fast",&flg)); 110 if (flg) { 111 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C)); 112 PetscFunctionReturn(0); 113 } 114 115 /* heap */ 116 PetscCall(PetscStrcmp(alg,"heap",&flg)); 117 if (flg) { 118 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C)); 119 PetscFunctionReturn(0); 120 } 121 122 /* btheap */ 123 PetscCall(PetscStrcmp(alg,"btheap",&flg)); 124 if (flg) { 125 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C)); 126 PetscFunctionReturn(0); 127 } 128 129 /* llcondensed */ 130 PetscCall(PetscStrcmp(alg,"llcondensed",&flg)); 131 if (flg) { 132 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C)); 133 PetscFunctionReturn(0); 134 } 135 136 /* rowmerge */ 137 PetscCall(PetscStrcmp(alg,"rowmerge",&flg)); 138 if (flg) { 139 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C)); 140 PetscFunctionReturn(0); 141 } 142 143 #if defined(PETSC_HAVE_HYPRE) 144 PetscCall(PetscStrcmp(alg,"hypre",&flg)); 145 if (flg) { 146 PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C)); 147 PetscFunctionReturn(0); 148 } 149 #endif 150 151 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported"); 152 } 153 154 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C) 155 { 156 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 157 PetscInt *ai=a->i,*bi=b->i,*ci,*cj; 158 PetscInt am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 159 PetscReal afill; 160 PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 161 PetscTable ta; 162 PetscBT lnkbt; 163 PetscFreeSpaceList free_space=NULL,current_space=NULL; 164 165 PetscFunctionBegin; 166 /* Get ci and cj */ 167 /*---------------*/ 168 /* Allocate ci array, arrays for fill computation and */ 169 /* free space for accumulating nonzero column info */ 170 PetscCall(PetscMalloc1(am+2,&ci)); 171 ci[0] = 0; 172 173 /* create and initialize a linked list */ 174 PetscCall(PetscTableCreate(bn,bn,&ta)); 175 MatRowMergeMax_SeqAIJ(b,bm,ta); 176 PetscCall(PetscTableGetCount(ta,&Crmax)); 177 PetscCall(PetscTableDestroy(&ta)); 178 179 PetscCall(PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt)); 180 181 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 182 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space)); 183 184 current_space = free_space; 185 186 /* Determine ci and cj */ 187 for (i=0; i<am; i++) { 188 anzi = ai[i+1] - ai[i]; 189 aj = a->j + ai[i]; 190 for (j=0; j<anzi; j++) { 191 brow = aj[j]; 192 bnzj = bi[brow+1] - bi[brow]; 193 bj = b->j + bi[brow]; 194 /* add non-zero cols of B into the sorted linked list lnk */ 195 PetscCall(PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt)); 196 } 197 /* add possible missing diagonal entry */ 198 if (C->force_diagonals) { 199 PetscCall(PetscLLCondensedAddSorted(1,&i,lnk,lnkbt)); 200 } 201 cnzi = lnk[0]; 202 203 /* If free space is not available, make more free space */ 204 /* Double the amount of total space in the list */ 205 if (current_space->local_remaining<cnzi) { 206 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space)); 207 ndouble++; 208 } 209 210 /* Copy data into free space, then initialize lnk */ 211 PetscCall(PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt)); 212 213 current_space->array += cnzi; 214 current_space->local_used += cnzi; 215 current_space->local_remaining -= cnzi; 216 217 ci[i+1] = ci[i] + cnzi; 218 } 219 220 /* Column indices are in the list of free space */ 221 /* Allocate space for cj, initialize cj, and */ 222 /* destroy list of free space and other temporary array(s) */ 223 PetscCall(PetscMalloc1(ci[am]+1,&cj)); 224 PetscCall(PetscFreeSpaceContiguous(&free_space,cj)); 225 PetscCall(PetscLLCondensedDestroy(lnk,lnkbt)); 226 227 /* put together the new symbolic matrix */ 228 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C)); 229 PetscCall(MatSetBlockSizesFromMats(C,A,B)); 230 231 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 232 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 233 c = (Mat_SeqAIJ*)(C->data); 234 c->free_a = PETSC_FALSE; 235 c->free_ij = PETSC_TRUE; 236 c->nonew = 0; 237 238 /* fast, needs non-scalable O(bn) array 'abdense' */ 239 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 240 241 /* set MatInfo */ 242 afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 243 if (afill < 1.0) afill = 1.0; 244 C->info.mallocs = ndouble; 245 C->info.fill_ratio_given = fill; 246 C->info.fill_ratio_needed = afill; 247 248 #if defined(PETSC_USE_INFO) 249 if (ci[am]) { 250 PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill)); 251 PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill)); 252 } else { 253 PetscCall(PetscInfo(C,"Empty matrix product\n")); 254 } 255 #endif 256 PetscFunctionReturn(0); 257 } 258 259 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C) 260 { 261 PetscLogDouble flops=0.0; 262 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 263 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 264 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 265 PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j; 266 PetscInt am =A->rmap->n,cm=C->rmap->n; 267 PetscInt i,j,k,anzi,bnzi,cnzi,brow; 268 PetscScalar *ca,valtmp; 269 PetscScalar *ab_dense; 270 PetscContainer cab_dense; 271 const PetscScalar *aa,*ba,*baj; 272 273 PetscFunctionBegin; 274 PetscCall(MatSeqAIJGetArrayRead(A,&aa)); 275 PetscCall(MatSeqAIJGetArrayRead(B,&ba)); 276 if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 277 PetscCall(PetscMalloc1(ci[cm]+1,&ca)); 278 c->a = ca; 279 c->free_a = PETSC_TRUE; 280 } else ca = c->a; 281 282 /* TODO this should be done in the symbolic phase */ 283 /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers 284 that is hard to eradicate) */ 285 PetscCall(PetscObjectQuery((PetscObject)C,"__PETSc__ab_dense",(PetscObject*)&cab_dense)); 286 if (!cab_dense) { 287 PetscCall(PetscMalloc1(B->cmap->N,&ab_dense)); 288 PetscCall(PetscContainerCreate(PETSC_COMM_SELF,&cab_dense)); 289 PetscCall(PetscContainerSetPointer(cab_dense,ab_dense)); 290 PetscCall(PetscContainerSetUserDestroy(cab_dense,PetscContainerUserDestroyDefault)); 291 PetscCall(PetscObjectCompose((PetscObject)C,"__PETSc__ab_dense",(PetscObject)cab_dense)); 292 PetscCall(PetscObjectDereference((PetscObject)cab_dense)); 293 } 294 PetscCall(PetscContainerGetPointer(cab_dense,(void**)&ab_dense)); 295 PetscCall(PetscArrayzero(ab_dense,B->cmap->N)); 296 297 /* clean old values in C */ 298 PetscCall(PetscArrayzero(ca,ci[cm])); 299 /* Traverse A row-wise. */ 300 /* Build the ith row in C by summing over nonzero columns in A, */ 301 /* the rows of B corresponding to nonzeros of A. */ 302 for (i=0; i<am; i++) { 303 anzi = ai[i+1] - ai[i]; 304 for (j=0; j<anzi; j++) { 305 brow = aj[j]; 306 bnzi = bi[brow+1] - bi[brow]; 307 bjj = bj + bi[brow]; 308 baj = ba + bi[brow]; 309 /* perform dense axpy */ 310 valtmp = aa[j]; 311 for (k=0; k<bnzi; k++) { 312 ab_dense[bjj[k]] += valtmp*baj[k]; 313 } 314 flops += 2*bnzi; 315 } 316 aj += anzi; aa += anzi; 317 318 cnzi = ci[i+1] - ci[i]; 319 for (k=0; k<cnzi; k++) { 320 ca[k] += ab_dense[cj[k]]; 321 ab_dense[cj[k]] = 0.0; /* zero ab_dense */ 322 } 323 flops += cnzi; 324 cj += cnzi; ca += cnzi; 325 } 326 #if defined(PETSC_HAVE_DEVICE) 327 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 328 #endif 329 PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY)); 330 PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY)); 331 PetscCall(PetscLogFlops(flops)); 332 PetscCall(MatSeqAIJRestoreArrayRead(A,&aa)); 333 PetscCall(MatSeqAIJRestoreArrayRead(B,&ba)); 334 PetscFunctionReturn(0); 335 } 336 337 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C) 338 { 339 PetscLogDouble flops=0.0; 340 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 341 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 342 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 343 PetscInt *ai = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j; 344 PetscInt am = A->rmap->N,cm=C->rmap->N; 345 PetscInt i,j,k,anzi,bnzi,cnzi,brow; 346 PetscScalar *ca=c->a,valtmp; 347 const PetscScalar *aa,*ba,*baj; 348 PetscInt nextb; 349 350 PetscFunctionBegin; 351 PetscCall(MatSeqAIJGetArrayRead(A,&aa)); 352 PetscCall(MatSeqAIJGetArrayRead(B,&ba)); 353 if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 354 PetscCall(PetscMalloc1(ci[cm]+1,&ca)); 355 c->a = ca; 356 c->free_a = PETSC_TRUE; 357 } 358 359 /* clean old values in C */ 360 PetscCall(PetscArrayzero(ca,ci[cm])); 361 /* Traverse A row-wise. */ 362 /* Build the ith row in C by summing over nonzero columns in A, */ 363 /* the rows of B corresponding to nonzeros of A. */ 364 for (i=0; i<am; i++) { 365 anzi = ai[i+1] - ai[i]; 366 cnzi = ci[i+1] - ci[i]; 367 for (j=0; j<anzi; j++) { 368 brow = aj[j]; 369 bnzi = bi[brow+1] - bi[brow]; 370 bjj = bj + bi[brow]; 371 baj = ba + bi[brow]; 372 /* perform sparse axpy */ 373 valtmp = aa[j]; 374 nextb = 0; 375 for (k=0; nextb<bnzi; k++) { 376 if (cj[k] == bjj[nextb]) { /* ccol == bcol */ 377 ca[k] += valtmp*baj[nextb++]; 378 } 379 } 380 flops += 2*bnzi; 381 } 382 aj += anzi; aa += anzi; 383 cj += cnzi; ca += cnzi; 384 } 385 #if defined(PETSC_HAVE_DEVICE) 386 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 387 #endif 388 PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY)); 389 PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY)); 390 PetscCall(PetscLogFlops(flops)); 391 PetscCall(MatSeqAIJRestoreArrayRead(A,&aa)); 392 PetscCall(MatSeqAIJRestoreArrayRead(B,&ba)); 393 PetscFunctionReturn(0); 394 } 395 396 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C) 397 { 398 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 399 PetscInt *ai = a->i,*bi=b->i,*ci,*cj; 400 PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 401 MatScalar *ca; 402 PetscReal afill; 403 PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 404 PetscTable ta; 405 PetscFreeSpaceList free_space=NULL,current_space=NULL; 406 407 PetscFunctionBegin; 408 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */ 409 /*-----------------------------------------------------------------------------------------*/ 410 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 411 PetscCall(PetscMalloc1(am+2,&ci)); 412 ci[0] = 0; 413 414 /* create and initialize a linked list */ 415 PetscCall(PetscTableCreate(bn,bn,&ta)); 416 MatRowMergeMax_SeqAIJ(b,bm,ta); 417 PetscCall(PetscTableGetCount(ta,&Crmax)); 418 PetscCall(PetscTableDestroy(&ta)); 419 420 PetscCall(PetscLLCondensedCreate_fast(Crmax,&lnk)); 421 422 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 423 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space)); 424 current_space = free_space; 425 426 /* Determine ci and cj */ 427 for (i=0; i<am; i++) { 428 anzi = ai[i+1] - ai[i]; 429 aj = a->j + ai[i]; 430 for (j=0; j<anzi; j++) { 431 brow = aj[j]; 432 bnzj = bi[brow+1] - bi[brow]; 433 bj = b->j + bi[brow]; 434 /* add non-zero cols of B into the sorted linked list lnk */ 435 PetscCall(PetscLLCondensedAddSorted_fast(bnzj,bj,lnk)); 436 } 437 /* add possible missing diagonal entry */ 438 if (C->force_diagonals) { 439 PetscCall(PetscLLCondensedAddSorted_fast(1,&i,lnk)); 440 } 441 cnzi = lnk[1]; 442 443 /* If free space is not available, make more free space */ 444 /* Double the amount of total space in the list */ 445 if (current_space->local_remaining<cnzi) { 446 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space)); 447 ndouble++; 448 } 449 450 /* Copy data into free space, then initialize lnk */ 451 PetscCall(PetscLLCondensedClean_fast(cnzi,current_space->array,lnk)); 452 453 current_space->array += cnzi; 454 current_space->local_used += cnzi; 455 current_space->local_remaining -= cnzi; 456 457 ci[i+1] = ci[i] + cnzi; 458 } 459 460 /* Column indices are in the list of free space */ 461 /* Allocate space for cj, initialize cj, and */ 462 /* destroy list of free space and other temporary array(s) */ 463 PetscCall(PetscMalloc1(ci[am]+1,&cj)); 464 PetscCall(PetscFreeSpaceContiguous(&free_space,cj)); 465 PetscCall(PetscLLCondensedDestroy_fast(lnk)); 466 467 /* Allocate space for ca */ 468 PetscCall(PetscCalloc1(ci[am]+1,&ca)); 469 470 /* put together the new symbolic matrix */ 471 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C)); 472 PetscCall(MatSetBlockSizesFromMats(C,A,B)); 473 474 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 475 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 476 c = (Mat_SeqAIJ*)(C->data); 477 c->free_a = PETSC_TRUE; 478 c->free_ij = PETSC_TRUE; 479 c->nonew = 0; 480 481 /* slower, less memory */ 482 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 483 484 /* set MatInfo */ 485 afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 486 if (afill < 1.0) afill = 1.0; 487 C->info.mallocs = ndouble; 488 C->info.fill_ratio_given = fill; 489 C->info.fill_ratio_needed = afill; 490 491 #if defined(PETSC_USE_INFO) 492 if (ci[am]) { 493 PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill)); 494 PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill)); 495 } else { 496 PetscCall(PetscInfo(C,"Empty matrix product\n")); 497 } 498 #endif 499 PetscFunctionReturn(0); 500 } 501 502 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C) 503 { 504 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 505 PetscInt *ai = a->i,*bi=b->i,*ci,*cj; 506 PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 507 MatScalar *ca; 508 PetscReal afill; 509 PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 510 PetscTable ta; 511 PetscFreeSpaceList free_space=NULL,current_space=NULL; 512 513 PetscFunctionBegin; 514 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */ 515 /*---------------------------------------------------------------------------------------------*/ 516 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 517 PetscCall(PetscMalloc1(am+2,&ci)); 518 ci[0] = 0; 519 520 /* create and initialize a linked list */ 521 PetscCall(PetscTableCreate(bn,bn,&ta)); 522 MatRowMergeMax_SeqAIJ(b,bm,ta); 523 PetscCall(PetscTableGetCount(ta,&Crmax)); 524 PetscCall(PetscTableDestroy(&ta)); 525 PetscCall(PetscLLCondensedCreate_Scalable(Crmax,&lnk)); 526 527 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 528 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space)); 529 current_space = free_space; 530 531 /* Determine ci and cj */ 532 for (i=0; i<am; i++) { 533 anzi = ai[i+1] - ai[i]; 534 aj = a->j + ai[i]; 535 for (j=0; j<anzi; j++) { 536 brow = aj[j]; 537 bnzj = bi[brow+1] - bi[brow]; 538 bj = b->j + bi[brow]; 539 /* add non-zero cols of B into the sorted linked list lnk */ 540 PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk)); 541 } 542 /* add possible missing diagonal entry */ 543 if (C->force_diagonals) { 544 PetscCall(PetscLLCondensedAddSorted_Scalable(1,&i,lnk)); 545 } 546 547 cnzi = lnk[0]; 548 549 /* If free space is not available, make more free space */ 550 /* Double the amount of total space in the list */ 551 if (current_space->local_remaining<cnzi) { 552 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space)); 553 ndouble++; 554 } 555 556 /* Copy data into free space, then initialize lnk */ 557 PetscCall(PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk)); 558 559 current_space->array += cnzi; 560 current_space->local_used += cnzi; 561 current_space->local_remaining -= cnzi; 562 563 ci[i+1] = ci[i] + cnzi; 564 } 565 566 /* Column indices are in the list of free space */ 567 /* Allocate space for cj, initialize cj, and */ 568 /* destroy list of free space and other temporary array(s) */ 569 PetscCall(PetscMalloc1(ci[am]+1,&cj)); 570 PetscCall(PetscFreeSpaceContiguous(&free_space,cj)); 571 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 572 573 /* Allocate space for ca */ 574 /*-----------------------*/ 575 PetscCall(PetscCalloc1(ci[am]+1,&ca)); 576 577 /* put together the new symbolic matrix */ 578 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C)); 579 PetscCall(MatSetBlockSizesFromMats(C,A,B)); 580 581 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 582 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 583 c = (Mat_SeqAIJ*)(C->data); 584 c->free_a = PETSC_TRUE; 585 c->free_ij = PETSC_TRUE; 586 c->nonew = 0; 587 588 /* slower, less memory */ 589 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 590 591 /* set MatInfo */ 592 afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 593 if (afill < 1.0) afill = 1.0; 594 C->info.mallocs = ndouble; 595 C->info.fill_ratio_given = fill; 596 C->info.fill_ratio_needed = afill; 597 598 #if defined(PETSC_USE_INFO) 599 if (ci[am]) { 600 PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill)); 601 PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill)); 602 } else { 603 PetscCall(PetscInfo(C,"Empty matrix product\n")); 604 } 605 #endif 606 PetscFunctionReturn(0); 607 } 608 609 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C) 610 { 611 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 612 const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j; 613 PetscInt *ci,*cj,*bb; 614 PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 615 PetscReal afill; 616 PetscInt i,j,col,ndouble = 0; 617 PetscFreeSpaceList free_space=NULL,current_space=NULL; 618 PetscHeap h; 619 620 PetscFunctionBegin; 621 /* Get ci and cj - by merging sorted rows using a heap */ 622 /*---------------------------------------------------------------------------------------------*/ 623 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 624 PetscCall(PetscMalloc1(am+2,&ci)); 625 ci[0] = 0; 626 627 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 628 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space)); 629 current_space = free_space; 630 631 PetscCall(PetscHeapCreate(a->rmax,&h)); 632 PetscCall(PetscMalloc1(a->rmax,&bb)); 633 634 /* Determine ci and cj */ 635 for (i=0; i<am; i++) { 636 const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 637 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 638 ci[i+1] = ci[i]; 639 /* Populate the min heap */ 640 for (j=0; j<anzi; j++) { 641 bb[j] = bi[acol[j]]; /* bb points at the start of the row */ 642 if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */ 643 PetscCall(PetscHeapAdd(h,j,bj[bb[j]++])); 644 } 645 } 646 /* Pick off the min element, adding it to free space */ 647 PetscCall(PetscHeapPop(h,&j,&col)); 648 while (j >= 0) { 649 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 650 PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space)); 651 ndouble++; 652 } 653 *(current_space->array++) = col; 654 current_space->local_used++; 655 current_space->local_remaining--; 656 ci[i+1]++; 657 658 /* stash if anything else remains in this row of B */ 659 if (bb[j] < bi[acol[j]+1]) PetscCall(PetscHeapStash(h,j,bj[bb[j]++])); 660 while (1) { /* pop and stash any other rows of B that also had an entry in this column */ 661 PetscInt j2,col2; 662 PetscCall(PetscHeapPeek(h,&j2,&col2)); 663 if (col2 != col) break; 664 PetscCall(PetscHeapPop(h,&j2,&col2)); 665 if (bb[j2] < bi[acol[j2]+1]) PetscCall(PetscHeapStash(h,j2,bj[bb[j2]++])); 666 } 667 /* Put any stashed elements back into the min heap */ 668 PetscCall(PetscHeapUnstash(h)); 669 PetscCall(PetscHeapPop(h,&j,&col)); 670 } 671 } 672 PetscCall(PetscFree(bb)); 673 PetscCall(PetscHeapDestroy(&h)); 674 675 /* Column indices are in the list of free space */ 676 /* Allocate space for cj, initialize cj, and */ 677 /* destroy list of free space and other temporary array(s) */ 678 PetscCall(PetscMalloc1(ci[am],&cj)); 679 PetscCall(PetscFreeSpaceContiguous(&free_space,cj)); 680 681 /* put together the new symbolic matrix */ 682 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C)); 683 PetscCall(MatSetBlockSizesFromMats(C,A,B)); 684 685 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 686 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 687 c = (Mat_SeqAIJ*)(C->data); 688 c->free_a = PETSC_TRUE; 689 c->free_ij = PETSC_TRUE; 690 c->nonew = 0; 691 692 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 693 694 /* set MatInfo */ 695 afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 696 if (afill < 1.0) afill = 1.0; 697 C->info.mallocs = ndouble; 698 C->info.fill_ratio_given = fill; 699 C->info.fill_ratio_needed = afill; 700 701 #if defined(PETSC_USE_INFO) 702 if (ci[am]) { 703 PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill)); 704 PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill)); 705 } else { 706 PetscCall(PetscInfo(C,"Empty matrix product\n")); 707 } 708 #endif 709 PetscFunctionReturn(0); 710 } 711 712 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C) 713 { 714 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 715 const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j; 716 PetscInt *ci,*cj,*bb; 717 PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 718 PetscReal afill; 719 PetscInt i,j,col,ndouble = 0; 720 PetscFreeSpaceList free_space=NULL,current_space=NULL; 721 PetscHeap h; 722 PetscBT bt; 723 724 PetscFunctionBegin; 725 /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */ 726 /*---------------------------------------------------------------------------------------------*/ 727 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 728 PetscCall(PetscMalloc1(am+2,&ci)); 729 ci[0] = 0; 730 731 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 732 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space)); 733 734 current_space = free_space; 735 736 PetscCall(PetscHeapCreate(a->rmax,&h)); 737 PetscCall(PetscMalloc1(a->rmax,&bb)); 738 PetscCall(PetscBTCreate(bn,&bt)); 739 740 /* Determine ci and cj */ 741 for (i=0; i<am; i++) { 742 const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 743 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 744 const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */ 745 ci[i+1] = ci[i]; 746 /* Populate the min heap */ 747 for (j=0; j<anzi; j++) { 748 PetscInt brow = acol[j]; 749 for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) { 750 PetscInt bcol = bj[bb[j]]; 751 if (!PetscBTLookupSet(bt,bcol)) { /* new entry */ 752 PetscCall(PetscHeapAdd(h,j,bcol)); 753 bb[j]++; 754 break; 755 } 756 } 757 } 758 /* Pick off the min element, adding it to free space */ 759 PetscCall(PetscHeapPop(h,&j,&col)); 760 while (j >= 0) { 761 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 762 fptr = NULL; /* need PetscBTMemzero */ 763 PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space)); 764 ndouble++; 765 } 766 *(current_space->array++) = col; 767 current_space->local_used++; 768 current_space->local_remaining--; 769 ci[i+1]++; 770 771 /* stash if anything else remains in this row of B */ 772 for (; bb[j] < bi[acol[j]+1]; bb[j]++) { 773 PetscInt bcol = bj[bb[j]]; 774 if (!PetscBTLookupSet(bt,bcol)) { /* new entry */ 775 PetscCall(PetscHeapAdd(h,j,bcol)); 776 bb[j]++; 777 break; 778 } 779 } 780 PetscCall(PetscHeapPop(h,&j,&col)); 781 } 782 if (fptr) { /* Clear the bits for this row */ 783 for (; fptr<current_space->array; fptr++) PetscCall(PetscBTClear(bt,*fptr)); 784 } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */ 785 PetscCall(PetscBTMemzero(bn,bt)); 786 } 787 } 788 PetscCall(PetscFree(bb)); 789 PetscCall(PetscHeapDestroy(&h)); 790 PetscCall(PetscBTDestroy(&bt)); 791 792 /* Column indices are in the list of free space */ 793 /* Allocate space for cj, initialize cj, and */ 794 /* destroy list of free space and other temporary array(s) */ 795 PetscCall(PetscMalloc1(ci[am],&cj)); 796 PetscCall(PetscFreeSpaceContiguous(&free_space,cj)); 797 798 /* put together the new symbolic matrix */ 799 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C)); 800 PetscCall(MatSetBlockSizesFromMats(C,A,B)); 801 802 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 803 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 804 c = (Mat_SeqAIJ*)(C->data); 805 c->free_a = PETSC_TRUE; 806 c->free_ij = PETSC_TRUE; 807 c->nonew = 0; 808 809 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 810 811 /* set MatInfo */ 812 afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 813 if (afill < 1.0) afill = 1.0; 814 C->info.mallocs = ndouble; 815 C->info.fill_ratio_given = fill; 816 C->info.fill_ratio_needed = afill; 817 818 #if defined(PETSC_USE_INFO) 819 if (ci[am]) { 820 PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill)); 821 PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill)); 822 } else { 823 PetscCall(PetscInfo(C,"Empty matrix product\n")); 824 } 825 #endif 826 PetscFunctionReturn(0); 827 } 828 829 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C) 830 { 831 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 832 const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1; 833 PetscInt *ci,*cj,*outputj,worki_L1[9],worki_L2[9]; 834 PetscInt c_maxmem,a_maxrownnz=0,a_rownnz; 835 const PetscInt workcol[8]={0,1,2,3,4,5,6,7}; 836 const PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 837 const PetscInt *brow_ptr[8],*brow_end[8]; 838 PetscInt window[8]; 839 PetscInt window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows; 840 PetscInt i,k,ndouble=0,L1_rowsleft,rowsleft; 841 PetscReal afill; 842 PetscInt *workj_L1,*workj_L2,*workj_L3; 843 PetscInt L1_nnz,L2_nnz; 844 845 /* Step 1: Get upper bound on memory required for allocation. 846 Because of the way virtual memory works, 847 only the memory pages that are actually needed will be physically allocated. */ 848 PetscFunctionBegin; 849 PetscCall(PetscMalloc1(am+1,&ci)); 850 for (i=0; i<am; i++) { 851 const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 852 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 853 a_rownnz = 0; 854 for (k=0; k<anzi; ++k) { 855 a_rownnz += bi[acol[k]+1] - bi[acol[k]]; 856 if (a_rownnz > bn) { 857 a_rownnz = bn; 858 break; 859 } 860 } 861 a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 862 } 863 /* temporary work areas for merging rows */ 864 PetscCall(PetscMalloc1(a_maxrownnz*8,&workj_L1)); 865 PetscCall(PetscMalloc1(a_maxrownnz*8,&workj_L2)); 866 PetscCall(PetscMalloc1(a_maxrownnz,&workj_L3)); 867 868 /* This should be enough for almost all matrices. If not, memory is reallocated later. */ 869 c_maxmem = 8*(ai[am]+bi[bm]); 870 /* Step 2: Populate pattern for C */ 871 PetscCall(PetscMalloc1(c_maxmem,&cj)); 872 873 ci_nnz = 0; 874 ci[0] = 0; 875 worki_L1[0] = 0; 876 worki_L2[0] = 0; 877 for (i=0; i<am; i++) { 878 const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 879 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 880 rowsleft = anzi; 881 inputcol_L1 = acol; 882 L2_nnz = 0; 883 L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */ 884 worki_L2[1] = 0; 885 outputi_nnz = 0; 886 887 /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */ 888 while (ci_nnz+a_maxrownnz > c_maxmem) { 889 c_maxmem *= 2; 890 ndouble++; 891 PetscCall(PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj)); 892 } 893 894 while (rowsleft) { 895 L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */ 896 L1_nrows = 0; 897 L1_nnz = 0; 898 inputcol = inputcol_L1; 899 inputi = bi; 900 inputj = bj; 901 902 /* The following macro is used to specialize for small rows in A. 903 This helps with compiler unrolling, improving performance substantially. 904 Input: inputj inputi inputcol bn 905 Output: outputj outputi_nnz */ 906 #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \ 907 window_min = bn; \ 908 outputi_nnz = 0; \ 909 for (k=0; k<ANNZ; ++k) { \ 910 brow_ptr[k] = inputj + inputi[inputcol[k]]; \ 911 brow_end[k] = inputj + inputi[inputcol[k]+1]; \ 912 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 913 window_min = PetscMin(window[k], window_min); \ 914 } \ 915 while (window_min < bn) { \ 916 outputj[outputi_nnz++] = window_min; \ 917 /* advance front and compute new minimum */ \ 918 old_window_min = window_min; \ 919 window_min = bn; \ 920 for (k=0; k<ANNZ; ++k) { \ 921 if (window[k] == old_window_min) { \ 922 brow_ptr[k]++; \ 923 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 924 } \ 925 window_min = PetscMin(window[k], window_min); \ 926 } \ 927 } 928 929 /************** L E V E L 1 ***************/ 930 /* Merge up to 8 rows of B to L1 work array*/ 931 while (L1_rowsleft) { 932 outputi_nnz = 0; 933 if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/ 934 else outputj = cj + ci_nnz; /* Merge directly to C */ 935 936 switch (L1_rowsleft) { 937 case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 938 brow_end[0] = inputj + inputi[inputcol[0]+1]; 939 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 940 inputcol += L1_rowsleft; 941 rowsleft -= L1_rowsleft; 942 L1_rowsleft = 0; 943 break; 944 case 2: MatMatMultSymbolic_RowMergeMacro(2); 945 inputcol += L1_rowsleft; 946 rowsleft -= L1_rowsleft; 947 L1_rowsleft = 0; 948 break; 949 case 3: MatMatMultSymbolic_RowMergeMacro(3); 950 inputcol += L1_rowsleft; 951 rowsleft -= L1_rowsleft; 952 L1_rowsleft = 0; 953 break; 954 case 4: MatMatMultSymbolic_RowMergeMacro(4); 955 inputcol += L1_rowsleft; 956 rowsleft -= L1_rowsleft; 957 L1_rowsleft = 0; 958 break; 959 case 5: MatMatMultSymbolic_RowMergeMacro(5); 960 inputcol += L1_rowsleft; 961 rowsleft -= L1_rowsleft; 962 L1_rowsleft = 0; 963 break; 964 case 6: MatMatMultSymbolic_RowMergeMacro(6); 965 inputcol += L1_rowsleft; 966 rowsleft -= L1_rowsleft; 967 L1_rowsleft = 0; 968 break; 969 case 7: MatMatMultSymbolic_RowMergeMacro(7); 970 inputcol += L1_rowsleft; 971 rowsleft -= L1_rowsleft; 972 L1_rowsleft = 0; 973 break; 974 default: MatMatMultSymbolic_RowMergeMacro(8); 975 inputcol += 8; 976 rowsleft -= 8; 977 L1_rowsleft -= 8; 978 break; 979 } 980 inputcol_L1 = inputcol; 981 L1_nnz += outputi_nnz; 982 worki_L1[++L1_nrows] = L1_nnz; 983 } 984 985 /********************** L E V E L 2 ************************/ 986 /* Merge from L1 work array to either C or to L2 work array */ 987 if (anzi > 8) { 988 inputi = worki_L1; 989 inputj = workj_L1; 990 inputcol = workcol; 991 outputi_nnz = 0; 992 993 if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */ 994 else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */ 995 996 switch (L1_nrows) { 997 case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 998 brow_end[0] = inputj + inputi[inputcol[0]+1]; 999 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1000 break; 1001 case 2: MatMatMultSymbolic_RowMergeMacro(2); break; 1002 case 3: MatMatMultSymbolic_RowMergeMacro(3); break; 1003 case 4: MatMatMultSymbolic_RowMergeMacro(4); break; 1004 case 5: MatMatMultSymbolic_RowMergeMacro(5); break; 1005 case 6: MatMatMultSymbolic_RowMergeMacro(6); break; 1006 case 7: MatMatMultSymbolic_RowMergeMacro(7); break; 1007 case 8: MatMatMultSymbolic_RowMergeMacro(8); break; 1008 default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!"); 1009 } 1010 L2_nnz += outputi_nnz; 1011 worki_L2[++L2_nrows] = L2_nnz; 1012 1013 /************************ L E V E L 3 **********************/ 1014 /* Merge from L2 work array to either C or to L2 work array */ 1015 if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) { 1016 inputi = worki_L2; 1017 inputj = workj_L2; 1018 inputcol = workcol; 1019 outputi_nnz = 0; 1020 if (rowsleft) outputj = workj_L3; 1021 else outputj = cj + ci_nnz; 1022 switch (L2_nrows) { 1023 case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 1024 brow_end[0] = inputj + inputi[inputcol[0]+1]; 1025 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1026 break; 1027 case 2: MatMatMultSymbolic_RowMergeMacro(2); break; 1028 case 3: MatMatMultSymbolic_RowMergeMacro(3); break; 1029 case 4: MatMatMultSymbolic_RowMergeMacro(4); break; 1030 case 5: MatMatMultSymbolic_RowMergeMacro(5); break; 1031 case 6: MatMatMultSymbolic_RowMergeMacro(6); break; 1032 case 7: MatMatMultSymbolic_RowMergeMacro(7); break; 1033 case 8: MatMatMultSymbolic_RowMergeMacro(8); break; 1034 default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!"); 1035 } 1036 L2_nrows = 1; 1037 L2_nnz = outputi_nnz; 1038 worki_L2[1] = outputi_nnz; 1039 /* Copy to workj_L2 */ 1040 if (rowsleft) { 1041 for (k=0; k<outputi_nnz; ++k) workj_L2[k] = outputj[k]; 1042 } 1043 } 1044 } 1045 } /* while (rowsleft) */ 1046 #undef MatMatMultSymbolic_RowMergeMacro 1047 1048 /* terminate current row */ 1049 ci_nnz += outputi_nnz; 1050 ci[i+1] = ci_nnz; 1051 } 1052 1053 /* Step 3: Create the new symbolic matrix */ 1054 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C)); 1055 PetscCall(MatSetBlockSizesFromMats(C,A,B)); 1056 1057 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1058 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1059 c = (Mat_SeqAIJ*)(C->data); 1060 c->free_a = PETSC_TRUE; 1061 c->free_ij = PETSC_TRUE; 1062 c->nonew = 0; 1063 1064 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1065 1066 /* set MatInfo */ 1067 afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 1068 if (afill < 1.0) afill = 1.0; 1069 C->info.mallocs = ndouble; 1070 C->info.fill_ratio_given = fill; 1071 C->info.fill_ratio_needed = afill; 1072 1073 #if defined(PETSC_USE_INFO) 1074 if (ci[am]) { 1075 PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill)); 1076 PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill)); 1077 } else { 1078 PetscCall(PetscInfo(C,"Empty matrix product\n")); 1079 } 1080 #endif 1081 1082 /* Step 4: Free temporary work areas */ 1083 PetscCall(PetscFree(workj_L1)); 1084 PetscCall(PetscFree(workj_L2)); 1085 PetscCall(PetscFree(workj_L3)); 1086 PetscFunctionReturn(0); 1087 } 1088 1089 /* concatenate unique entries and then sort */ 1090 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C) 1091 { 1092 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 1093 const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j; 1094 PetscInt *ci,*cj,bcol; 1095 PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 1096 PetscReal afill; 1097 PetscInt i,j,ndouble = 0; 1098 PetscSegBuffer seg,segrow; 1099 char *seen; 1100 1101 PetscFunctionBegin; 1102 PetscCall(PetscMalloc1(am+1,&ci)); 1103 ci[0] = 0; 1104 1105 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 1106 PetscCall(PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg)); 1107 PetscCall(PetscSegBufferCreate(sizeof(PetscInt),100,&segrow)); 1108 PetscCall(PetscCalloc1(bn,&seen)); 1109 1110 /* Determine ci and cj */ 1111 for (i=0; i<am; i++) { 1112 const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1113 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 1114 PetscInt packlen = 0,*PETSC_RESTRICT crow; 1115 1116 /* Pack segrow */ 1117 for (j=0; j<anzi; j++) { 1118 PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k; 1119 for (k=bjstart; k<bjend; k++) { 1120 bcol = bj[k]; 1121 if (!seen[bcol]) { /* new entry */ 1122 PetscInt *PETSC_RESTRICT slot; 1123 PetscCall(PetscSegBufferGetInts(segrow,1,&slot)); 1124 *slot = bcol; 1125 seen[bcol] = 1; 1126 packlen++; 1127 } 1128 } 1129 } 1130 1131 /* Check i-th diagonal entry */ 1132 if (C->force_diagonals && !seen[i]) { 1133 PetscInt *PETSC_RESTRICT slot; 1134 PetscCall(PetscSegBufferGetInts(segrow,1,&slot)); 1135 *slot = i; 1136 seen[i] = 1; 1137 packlen++; 1138 } 1139 1140 PetscCall(PetscSegBufferGetInts(seg,packlen,&crow)); 1141 PetscCall(PetscSegBufferExtractTo(segrow,crow)); 1142 PetscCall(PetscSortInt(packlen,crow)); 1143 ci[i+1] = ci[i] + packlen; 1144 for (j=0; j<packlen; j++) seen[crow[j]] = 0; 1145 } 1146 PetscCall(PetscSegBufferDestroy(&segrow)); 1147 PetscCall(PetscFree(seen)); 1148 1149 /* Column indices are in the segmented buffer */ 1150 PetscCall(PetscSegBufferExtractAlloc(seg,&cj)); 1151 PetscCall(PetscSegBufferDestroy(&seg)); 1152 1153 /* put together the new symbolic matrix */ 1154 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C)); 1155 PetscCall(MatSetBlockSizesFromMats(C,A,B)); 1156 1157 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1158 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1159 c = (Mat_SeqAIJ*)(C->data); 1160 c->free_a = PETSC_TRUE; 1161 c->free_ij = PETSC_TRUE; 1162 c->nonew = 0; 1163 1164 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1165 1166 /* set MatInfo */ 1167 afill = (PetscReal)ci[am]/PetscMax(ai[am]+bi[bm],1) + 1.e-5; 1168 if (afill < 1.0) afill = 1.0; 1169 C->info.mallocs = ndouble; 1170 C->info.fill_ratio_given = fill; 1171 C->info.fill_ratio_needed = afill; 1172 1173 #if defined(PETSC_USE_INFO) 1174 if (ci[am]) { 1175 PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill)); 1176 PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill)); 1177 } else { 1178 PetscCall(PetscInfo(C,"Empty matrix product\n")); 1179 } 1180 #endif 1181 PetscFunctionReturn(0); 1182 } 1183 1184 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data) 1185 { 1186 Mat_MatMatTransMult *abt=(Mat_MatMatTransMult *)data; 1187 1188 PetscFunctionBegin; 1189 PetscCall(MatTransposeColoringDestroy(&abt->matcoloring)); 1190 PetscCall(MatDestroy(&abt->Bt_den)); 1191 PetscCall(MatDestroy(&abt->ABt_den)); 1192 PetscCall(PetscFree(abt)); 1193 PetscFunctionReturn(0); 1194 } 1195 1196 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C) 1197 { 1198 Mat Bt; 1199 PetscInt *bti,*btj; 1200 Mat_MatMatTransMult *abt; 1201 Mat_Product *product = C->product; 1202 char *alg; 1203 1204 PetscFunctionBegin; 1205 PetscCheck(product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 1206 PetscCheck(!product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty"); 1207 1208 /* create symbolic Bt */ 1209 PetscCall(MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj)); 1210 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt)); 1211 PetscCall(MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs))); 1212 PetscCall(MatSetType(Bt,((PetscObject)A)->type_name)); 1213 1214 /* get symbolic C=A*Bt */ 1215 PetscCall(PetscStrallocpy(product->alg,&alg)); 1216 PetscCall(MatProductSetAlgorithm(C,"sorted")); /* set algorithm for C = A*Bt */ 1217 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C)); 1218 PetscCall(MatProductSetAlgorithm(C,alg)); /* resume original algorithm for ABt product */ 1219 PetscCall(PetscFree(alg)); 1220 1221 /* create a supporting struct for reuse intermediate dense matrices with matcoloring */ 1222 PetscCall(PetscNew(&abt)); 1223 1224 product->data = abt; 1225 product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans; 1226 1227 C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ; 1228 1229 abt->usecoloring = PETSC_FALSE; 1230 PetscCall(PetscStrcmp(product->alg,"color",&abt->usecoloring)); 1231 if (abt->usecoloring) { 1232 /* Create MatTransposeColoring from symbolic C=A*B^T */ 1233 MatTransposeColoring matcoloring; 1234 MatColoring coloring; 1235 ISColoring iscoloring; 1236 Mat Bt_dense,C_dense; 1237 1238 /* inode causes memory problem */ 1239 PetscCall(MatSetOption(C,MAT_USE_INODES,PETSC_FALSE)); 1240 1241 PetscCall(MatColoringCreate(C,&coloring)); 1242 PetscCall(MatColoringSetDistance(coloring,2)); 1243 PetscCall(MatColoringSetType(coloring,MATCOLORINGSL)); 1244 PetscCall(MatColoringSetFromOptions(coloring)); 1245 PetscCall(MatColoringApply(coloring,&iscoloring)); 1246 PetscCall(MatColoringDestroy(&coloring)); 1247 PetscCall(MatTransposeColoringCreate(C,iscoloring,&matcoloring)); 1248 1249 abt->matcoloring = matcoloring; 1250 1251 PetscCall(ISColoringDestroy(&iscoloring)); 1252 1253 /* Create Bt_dense and C_dense = A*Bt_dense */ 1254 PetscCall(MatCreate(PETSC_COMM_SELF,&Bt_dense)); 1255 PetscCall(MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors)); 1256 PetscCall(MatSetType(Bt_dense,MATSEQDENSE)); 1257 PetscCall(MatSeqDenseSetPreallocation(Bt_dense,NULL)); 1258 1259 Bt_dense->assembled = PETSC_TRUE; 1260 abt->Bt_den = Bt_dense; 1261 1262 PetscCall(MatCreate(PETSC_COMM_SELF,&C_dense)); 1263 PetscCall(MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors)); 1264 PetscCall(MatSetType(C_dense,MATSEQDENSE)); 1265 PetscCall(MatSeqDenseSetPreallocation(C_dense,NULL)); 1266 1267 Bt_dense->assembled = PETSC_TRUE; 1268 abt->ABt_den = C_dense; 1269 1270 #if defined(PETSC_USE_INFO) 1271 { 1272 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 1273 PetscCall(PetscInfo(C,"Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(double)(((PetscReal)(c->nz))/((PetscReal)(A->rmap->n*matcoloring->ncolors))))); 1274 } 1275 #endif 1276 } 1277 /* clean up */ 1278 PetscCall(MatDestroy(&Bt)); 1279 PetscCall(MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj)); 1280 PetscFunctionReturn(0); 1281 } 1282 1283 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 1284 { 1285 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data; 1286 PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow; 1287 PetscInt cm =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol; 1288 PetscLogDouble flops=0.0; 1289 MatScalar *aa =a->a,*aval,*ba=b->a,*bval,*ca,*cval; 1290 Mat_MatMatTransMult *abt; 1291 Mat_Product *product = C->product; 1292 1293 PetscFunctionBegin; 1294 PetscCheck(product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 1295 abt = (Mat_MatMatTransMult *)product->data; 1296 PetscCheck(abt,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 1297 /* clear old values in C */ 1298 if (!c->a) { 1299 PetscCall(PetscCalloc1(ci[cm]+1,&ca)); 1300 c->a = ca; 1301 c->free_a = PETSC_TRUE; 1302 } else { 1303 ca = c->a; 1304 PetscCall(PetscArrayzero(ca,ci[cm]+1)); 1305 } 1306 1307 if (abt->usecoloring) { 1308 MatTransposeColoring matcoloring = abt->matcoloring; 1309 Mat Bt_dense,C_dense = abt->ABt_den; 1310 1311 /* Get Bt_dense by Apply MatTransposeColoring to B */ 1312 Bt_dense = abt->Bt_den; 1313 PetscCall(MatTransColoringApplySpToDen(matcoloring,B,Bt_dense)); 1314 1315 /* C_dense = A*Bt_dense */ 1316 PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense)); 1317 1318 /* Recover C from C_dense */ 1319 PetscCall(MatTransColoringApplyDenToSp(matcoloring,C_dense,C)); 1320 PetscFunctionReturn(0); 1321 } 1322 1323 for (i=0; i<cm; i++) { 1324 anzi = ai[i+1] - ai[i]; 1325 acol = aj + ai[i]; 1326 aval = aa + ai[i]; 1327 cnzi = ci[i+1] - ci[i]; 1328 ccol = cj + ci[i]; 1329 cval = ca + ci[i]; 1330 for (j=0; j<cnzi; j++) { 1331 brow = ccol[j]; 1332 bnzj = bi[brow+1] - bi[brow]; 1333 bcol = bj + bi[brow]; 1334 bval = ba + bi[brow]; 1335 1336 /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */ 1337 nexta = 0; nextb = 0; 1338 while (nexta<anzi && nextb<bnzj) { 1339 while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++; 1340 if (nexta == anzi) break; 1341 while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++; 1342 if (nextb == bnzj) break; 1343 if (acol[nexta] == bcol[nextb]) { 1344 cval[j] += aval[nexta]*bval[nextb]; 1345 nexta++; nextb++; 1346 flops += 2; 1347 } 1348 } 1349 } 1350 } 1351 PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY)); 1352 PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY)); 1353 PetscCall(PetscLogFlops(flops)); 1354 PetscFunctionReturn(0); 1355 } 1356 1357 PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data) 1358 { 1359 Mat_MatTransMatMult *atb = (Mat_MatTransMatMult*)data; 1360 1361 PetscFunctionBegin; 1362 PetscCall(MatDestroy(&atb->At)); 1363 if (atb->destroy) { 1364 PetscCall((*atb->destroy)(atb->data)); 1365 } 1366 PetscCall(PetscFree(atb)); 1367 PetscFunctionReturn(0); 1368 } 1369 1370 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C) 1371 { 1372 Mat At = NULL; 1373 PetscInt *ati,*atj; 1374 Mat_Product *product = C->product; 1375 PetscBool flg,def,square; 1376 1377 PetscFunctionBegin; 1378 MatCheckProduct(C,4); 1379 square = (PetscBool)(A == B && A->symmetric && A->symmetric_set); 1380 /* outerproduct */ 1381 PetscCall(PetscStrcmp(product->alg,"outerproduct",&flg)); 1382 if (flg) { 1383 /* create symbolic At */ 1384 if (!square) { 1385 PetscCall(MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj)); 1386 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At)); 1387 PetscCall(MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs))); 1388 PetscCall(MatSetType(At,((PetscObject)A)->type_name)); 1389 } 1390 /* get symbolic C=At*B */ 1391 PetscCall(MatProductSetAlgorithm(C,"sorted")); 1392 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C)); 1393 1394 /* clean up */ 1395 if (!square) { 1396 PetscCall(MatDestroy(&At)); 1397 PetscCall(MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj)); 1398 } 1399 1400 C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */ 1401 PetscCall(MatProductSetAlgorithm(C,"outerproduct")); 1402 PetscFunctionReturn(0); 1403 } 1404 1405 /* matmatmult */ 1406 PetscCall(PetscStrcmp(product->alg,"default",&def)); 1407 PetscCall(PetscStrcmp(product->alg,"at*b",&flg)); 1408 if (flg || def) { 1409 Mat_MatTransMatMult *atb; 1410 1411 PetscCheck(!product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty"); 1412 PetscCall(PetscNew(&atb)); 1413 if (!square) { 1414 PetscCall(MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At)); 1415 } 1416 PetscCall(MatProductSetAlgorithm(C,"sorted")); 1417 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C)); 1418 PetscCall(MatProductSetAlgorithm(C,"at*b")); 1419 product->data = atb; 1420 product->destroy = MatDestroy_SeqAIJ_MatTransMatMult; 1421 atb->At = At; 1422 atb->updateAt = PETSC_FALSE; /* because At is computed here */ 1423 1424 C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */ 1425 PetscFunctionReturn(0); 1426 } 1427 1428 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported"); 1429 } 1430 1431 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 1432 { 1433 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data; 1434 PetscInt am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb; 1435 PetscInt cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k; 1436 PetscLogDouble flops=0.0; 1437 MatScalar *aa=a->a,*ba,*ca,*caj; 1438 1439 PetscFunctionBegin; 1440 if (!c->a) { 1441 PetscCall(PetscCalloc1(ci[cm]+1,&ca)); 1442 1443 c->a = ca; 1444 c->free_a = PETSC_TRUE; 1445 } else { 1446 ca = c->a; 1447 PetscCall(PetscArrayzero(ca,ci[cm])); 1448 } 1449 1450 /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */ 1451 for (i=0; i<am; i++) { 1452 bj = b->j + bi[i]; 1453 ba = b->a + bi[i]; 1454 bnzi = bi[i+1] - bi[i]; 1455 anzi = ai[i+1] - ai[i]; 1456 for (j=0; j<anzi; j++) { 1457 nextb = 0; 1458 crow = *aj++; 1459 cjj = cj + ci[crow]; 1460 caj = ca + ci[crow]; 1461 /* perform sparse axpy operation. Note cjj includes bj. */ 1462 for (k=0; nextb<bnzi; k++) { 1463 if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */ 1464 caj[k] += (*aa)*(*(ba+nextb)); 1465 nextb++; 1466 } 1467 } 1468 flops += 2*bnzi; 1469 aa++; 1470 } 1471 } 1472 1473 /* Assemble the final matrix and clean up */ 1474 PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY)); 1475 PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY)); 1476 PetscCall(PetscLogFlops(flops)); 1477 PetscFunctionReturn(0); 1478 } 1479 1480 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C) 1481 { 1482 PetscFunctionBegin; 1483 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C)); 1484 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense; 1485 PetscFunctionReturn(0); 1486 } 1487 1488 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C,const PetscBool add) 1489 { 1490 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 1491 PetscScalar *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4; 1492 const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av; 1493 const PetscInt *aj; 1494 PetscInt cm=C->rmap->n,cn=B->cmap->n,bm,am=A->rmap->n; 1495 PetscInt clda; 1496 PetscInt am4,bm4,col,i,j,n; 1497 1498 PetscFunctionBegin; 1499 if (!cm || !cn) PetscFunctionReturn(0); 1500 PetscCall(MatSeqAIJGetArrayRead(A,&av)); 1501 if (add) { 1502 PetscCall(MatDenseGetArray(C,&c)); 1503 } else { 1504 PetscCall(MatDenseGetArrayWrite(C,&c)); 1505 } 1506 PetscCall(MatDenseGetArrayRead(B,&b)); 1507 PetscCall(MatDenseGetLDA(B,&bm)); 1508 PetscCall(MatDenseGetLDA(C,&clda)); 1509 am4 = 4*clda; 1510 bm4 = 4*bm; 1511 b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm; 1512 c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda; 1513 for (col=0; col<(cn/4)*4; col += 4) { /* over columns of C */ 1514 for (i=0; i<am; i++) { /* over rows of A in those columns */ 1515 r1 = r2 = r3 = r4 = 0.0; 1516 n = a->i[i+1] - a->i[i]; 1517 aj = a->j + a->i[i]; 1518 aa = av + a->i[i]; 1519 for (j=0; j<n; j++) { 1520 const PetscScalar aatmp = aa[j]; 1521 const PetscInt ajtmp = aj[j]; 1522 r1 += aatmp*b1[ajtmp]; 1523 r2 += aatmp*b2[ajtmp]; 1524 r3 += aatmp*b3[ajtmp]; 1525 r4 += aatmp*b4[ajtmp]; 1526 } 1527 if (add) { 1528 c1[i] += r1; 1529 c2[i] += r2; 1530 c3[i] += r3; 1531 c4[i] += r4; 1532 } else { 1533 c1[i] = r1; 1534 c2[i] = r2; 1535 c3[i] = r3; 1536 c4[i] = r4; 1537 } 1538 } 1539 b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4; 1540 c1 += am4; c2 += am4; c3 += am4; c4 += am4; 1541 } 1542 /* process remaining columns */ 1543 if (col != cn) { 1544 PetscInt rc = cn-col; 1545 1546 if (rc == 1) { 1547 for (i=0; i<am; i++) { 1548 r1 = 0.0; 1549 n = a->i[i+1] - a->i[i]; 1550 aj = a->j + a->i[i]; 1551 aa = av + a->i[i]; 1552 for (j=0; j<n; j++) r1 += aa[j]*b1[aj[j]]; 1553 if (add) c1[i] += r1; 1554 else c1[i] = r1; 1555 } 1556 } else if (rc == 2) { 1557 for (i=0; i<am; i++) { 1558 r1 = r2 = 0.0; 1559 n = a->i[i+1] - a->i[i]; 1560 aj = a->j + a->i[i]; 1561 aa = av + a->i[i]; 1562 for (j=0; j<n; j++) { 1563 const PetscScalar aatmp = aa[j]; 1564 const PetscInt ajtmp = aj[j]; 1565 r1 += aatmp*b1[ajtmp]; 1566 r2 += aatmp*b2[ajtmp]; 1567 } 1568 if (add) { 1569 c1[i] += r1; 1570 c2[i] += r2; 1571 } else { 1572 c1[i] = r1; 1573 c2[i] = r2; 1574 } 1575 } 1576 } else { 1577 for (i=0; i<am; i++) { 1578 r1 = r2 = r3 = 0.0; 1579 n = a->i[i+1] - a->i[i]; 1580 aj = a->j + a->i[i]; 1581 aa = av + a->i[i]; 1582 for (j=0; j<n; j++) { 1583 const PetscScalar aatmp = aa[j]; 1584 const PetscInt ajtmp = aj[j]; 1585 r1 += aatmp*b1[ajtmp]; 1586 r2 += aatmp*b2[ajtmp]; 1587 r3 += aatmp*b3[ajtmp]; 1588 } 1589 if (add) { 1590 c1[i] += r1; 1591 c2[i] += r2; 1592 c3[i] += r3; 1593 } else { 1594 c1[i] = r1; 1595 c2[i] = r2; 1596 c3[i] = r3; 1597 } 1598 } 1599 } 1600 } 1601 PetscCall(PetscLogFlops(cn*(2.0*a->nz))); 1602 if (add) { 1603 PetscCall(MatDenseRestoreArray(C,&c)); 1604 } else { 1605 PetscCall(MatDenseRestoreArrayWrite(C,&c)); 1606 } 1607 PetscCall(MatDenseRestoreArrayRead(B,&b)); 1608 PetscCall(MatSeqAIJRestoreArrayRead(A,&av)); 1609 PetscFunctionReturn(0); 1610 } 1611 1612 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C) 1613 { 1614 PetscFunctionBegin; 1615 PetscCheck(B->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n); 1616 PetscCheck(A->rmap->n == C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n); 1617 PetscCheck(B->cmap->n == C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n); 1618 1619 PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C,PETSC_FALSE)); 1620 PetscFunctionReturn(0); 1621 } 1622 1623 /* ------------------------------------------------------- */ 1624 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C) 1625 { 1626 PetscFunctionBegin; 1627 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense; 1628 C->ops->productsymbolic = MatProductSymbolic_AB; 1629 PetscFunctionReturn(0); 1630 } 1631 1632 PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat,Mat,PetscReal,Mat); 1633 1634 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C) 1635 { 1636 PetscFunctionBegin; 1637 C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1638 C->ops->productsymbolic = MatProductSymbolic_AtB; 1639 PetscFunctionReturn(0); 1640 } 1641 1642 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C) 1643 { 1644 PetscFunctionBegin; 1645 C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1646 C->ops->productsymbolic = MatProductSymbolic_ABt; 1647 PetscFunctionReturn(0); 1648 } 1649 1650 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C) 1651 { 1652 Mat_Product *product = C->product; 1653 1654 PetscFunctionBegin; 1655 switch (product->type) { 1656 case MATPRODUCT_AB: 1657 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C)); 1658 break; 1659 case MATPRODUCT_AtB: 1660 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C)); 1661 break; 1662 case MATPRODUCT_ABt: 1663 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C)); 1664 break; 1665 default: 1666 break; 1667 } 1668 PetscFunctionReturn(0); 1669 } 1670 /* ------------------------------------------------------- */ 1671 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C) 1672 { 1673 Mat_Product *product = C->product; 1674 Mat A = product->A; 1675 PetscBool baij; 1676 1677 PetscFunctionBegin; 1678 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij)); 1679 if (!baij) { /* A is seqsbaij */ 1680 PetscBool sbaij; 1681 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij)); 1682 PetscCheck(sbaij,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format"); 1683 1684 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense; 1685 } else { /* A is seqbaij */ 1686 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense; 1687 } 1688 1689 C->ops->productsymbolic = MatProductSymbolic_AB; 1690 PetscFunctionReturn(0); 1691 } 1692 1693 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C) 1694 { 1695 Mat_Product *product = C->product; 1696 1697 PetscFunctionBegin; 1698 MatCheckProduct(C,1); 1699 PetscCheck(product->A,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A"); 1700 if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric)) { 1701 PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C)); 1702 } 1703 PetscFunctionReturn(0); 1704 } 1705 1706 /* ------------------------------------------------------- */ 1707 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C) 1708 { 1709 PetscFunctionBegin; 1710 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ; 1711 C->ops->productsymbolic = MatProductSymbolic_AB; 1712 PetscFunctionReturn(0); 1713 } 1714 1715 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C) 1716 { 1717 Mat_Product *product = C->product; 1718 1719 PetscFunctionBegin; 1720 if (product->type == MATPRODUCT_AB) { 1721 PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C)); 1722 } 1723 PetscFunctionReturn(0); 1724 } 1725 /* ------------------------------------------------------- */ 1726 1727 PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense) 1728 { 1729 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 1730 Mat_SeqDense *btdense = (Mat_SeqDense*)Btdense->data; 1731 PetscInt *bi = b->i,*bj=b->j; 1732 PetscInt m = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns; 1733 MatScalar *btval,*btval_den,*ba=b->a; 1734 PetscInt *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors; 1735 1736 PetscFunctionBegin; 1737 btval_den=btdense->v; 1738 PetscCall(PetscArrayzero(btval_den,m*n)); 1739 for (k=0; k<ncolors; k++) { 1740 ncolumns = coloring->ncolumns[k]; 1741 for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */ 1742 col = *(columns + colorforcol[k] + l); 1743 btcol = bj + bi[col]; 1744 btval = ba + bi[col]; 1745 anz = bi[col+1] - bi[col]; 1746 for (j=0; j<anz; j++) { 1747 brow = btcol[j]; 1748 btval_den[brow] = btval[j]; 1749 } 1750 } 1751 btval_den += m; 1752 } 1753 PetscFunctionReturn(0); 1754 } 1755 1756 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp) 1757 { 1758 Mat_SeqAIJ *csp = (Mat_SeqAIJ*)Csp->data; 1759 const PetscScalar *ca_den,*ca_den_ptr; 1760 PetscScalar *ca=csp->a; 1761 PetscInt k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors; 1762 PetscInt brows=matcoloring->brows,*den2sp=matcoloring->den2sp; 1763 PetscInt nrows,*row,*idx; 1764 PetscInt *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow; 1765 1766 PetscFunctionBegin; 1767 PetscCall(MatDenseGetArrayRead(Cden,&ca_den)); 1768 1769 if (brows > 0) { 1770 PetscInt *lstart,row_end,row_start; 1771 lstart = matcoloring->lstart; 1772 PetscCall(PetscArrayzero(lstart,ncolors)); 1773 1774 row_end = brows; 1775 if (row_end > m) row_end = m; 1776 for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */ 1777 ca_den_ptr = ca_den; 1778 for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */ 1779 nrows = matcoloring->nrows[k]; 1780 row = rows + colorforrow[k]; 1781 idx = den2sp + colorforrow[k]; 1782 for (l=lstart[k]; l<nrows; l++) { 1783 if (row[l] >= row_end) { 1784 lstart[k] = l; 1785 break; 1786 } else { 1787 ca[idx[l]] = ca_den_ptr[row[l]]; 1788 } 1789 } 1790 ca_den_ptr += m; 1791 } 1792 row_end += brows; 1793 if (row_end > m) row_end = m; 1794 } 1795 } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */ 1796 ca_den_ptr = ca_den; 1797 for (k=0; k<ncolors; k++) { 1798 nrows = matcoloring->nrows[k]; 1799 row = rows + colorforrow[k]; 1800 idx = den2sp + colorforrow[k]; 1801 for (l=0; l<nrows; l++) { 1802 ca[idx[l]] = ca_den_ptr[row[l]]; 1803 } 1804 ca_den_ptr += m; 1805 } 1806 } 1807 1808 PetscCall(MatDenseRestoreArrayRead(Cden,&ca_den)); 1809 #if defined(PETSC_USE_INFO) 1810 if (matcoloring->brows > 0) { 1811 PetscCall(PetscInfo(Csp,"Loop over %" PetscInt_FMT " row blocks for den2sp\n",brows)); 1812 } else { 1813 PetscCall(PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n")); 1814 } 1815 #endif 1816 PetscFunctionReturn(0); 1817 } 1818 1819 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c) 1820 { 1821 PetscInt i,n,nrows,Nbs,j,k,m,ncols,col,cm; 1822 const PetscInt *is,*ci,*cj,*row_idx; 1823 PetscInt nis = iscoloring->n,*rowhit,bs = 1; 1824 IS *isa; 1825 Mat_SeqAIJ *csp = (Mat_SeqAIJ*)mat->data; 1826 PetscInt *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i; 1827 PetscInt *colorforcol,*columns,*columns_i,brows; 1828 PetscBool flg; 1829 1830 PetscFunctionBegin; 1831 PetscCall(ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa)); 1832 1833 /* bs >1 is not being tested yet! */ 1834 Nbs = mat->cmap->N/bs; 1835 c->M = mat->rmap->N/bs; /* set total rows, columns and local rows */ 1836 c->N = Nbs; 1837 c->m = c->M; 1838 c->rstart = 0; 1839 c->brows = 100; 1840 1841 c->ncolors = nis; 1842 PetscCall(PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow)); 1843 PetscCall(PetscMalloc1(csp->nz+1,&rows)); 1844 PetscCall(PetscMalloc1(csp->nz+1,&den2sp)); 1845 1846 brows = c->brows; 1847 PetscCall(PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg)); 1848 if (flg) c->brows = brows; 1849 if (brows > 0) { 1850 PetscCall(PetscMalloc1(nis+1,&c->lstart)); 1851 } 1852 1853 colorforrow[0] = 0; 1854 rows_i = rows; 1855 den2sp_i = den2sp; 1856 1857 PetscCall(PetscMalloc1(nis+1,&colorforcol)); 1858 PetscCall(PetscMalloc1(Nbs+1,&columns)); 1859 1860 colorforcol[0] = 0; 1861 columns_i = columns; 1862 1863 /* get column-wise storage of mat */ 1864 PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL)); 1865 1866 cm = c->m; 1867 PetscCall(PetscMalloc1(cm+1,&rowhit)); 1868 PetscCall(PetscMalloc1(cm+1,&idxhit)); 1869 for (i=0; i<nis; i++) { /* loop over color */ 1870 PetscCall(ISGetLocalSize(isa[i],&n)); 1871 PetscCall(ISGetIndices(isa[i],&is)); 1872 1873 c->ncolumns[i] = n; 1874 if (n) { 1875 PetscCall(PetscArraycpy(columns_i,is,n)); 1876 } 1877 colorforcol[i+1] = colorforcol[i] + n; 1878 columns_i += n; 1879 1880 /* fast, crude version requires O(N*N) work */ 1881 PetscCall(PetscArrayzero(rowhit,cm)); 1882 1883 for (j=0; j<n; j++) { /* loop over columns*/ 1884 col = is[j]; 1885 row_idx = cj + ci[col]; 1886 m = ci[col+1] - ci[col]; 1887 for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */ 1888 idxhit[*row_idx] = spidx[ci[col] + k]; 1889 rowhit[*row_idx++] = col + 1; 1890 } 1891 } 1892 /* count the number of hits */ 1893 nrows = 0; 1894 for (j=0; j<cm; j++) { 1895 if (rowhit[j]) nrows++; 1896 } 1897 c->nrows[i] = nrows; 1898 colorforrow[i+1] = colorforrow[i] + nrows; 1899 1900 nrows = 0; 1901 for (j=0; j<cm; j++) { /* loop over rows */ 1902 if (rowhit[j]) { 1903 rows_i[nrows] = j; 1904 den2sp_i[nrows] = idxhit[j]; 1905 nrows++; 1906 } 1907 } 1908 den2sp_i += nrows; 1909 1910 PetscCall(ISRestoreIndices(isa[i],&is)); 1911 rows_i += nrows; 1912 } 1913 PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL)); 1914 PetscCall(PetscFree(rowhit)); 1915 PetscCall(ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa)); 1916 PetscCheck(csp->nz == colorforrow[nis],PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT,csp->nz,colorforrow[nis]); 1917 1918 c->colorforrow = colorforrow; 1919 c->rows = rows; 1920 c->den2sp = den2sp; 1921 c->colorforcol = colorforcol; 1922 c->columns = columns; 1923 1924 PetscCall(PetscFree(idxhit)); 1925 PetscFunctionReturn(0); 1926 } 1927 1928 /* --------------------------------------------------------------- */ 1929 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C) 1930 { 1931 Mat_Product *product = C->product; 1932 Mat A=product->A,B=product->B; 1933 1934 PetscFunctionBegin; 1935 if (C->ops->mattransposemultnumeric) { 1936 /* Alg: "outerproduct" */ 1937 PetscCall((*C->ops->mattransposemultnumeric)(A,B,C)); 1938 } else { 1939 /* Alg: "matmatmult" -- C = At*B */ 1940 Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data; 1941 Mat At; 1942 1943 PetscCheck(atb,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 1944 At = atb->At; 1945 if (atb->updateAt && At) { /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ() */ 1946 PetscCall(MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At)); 1947 } 1948 PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(At ? At : A,B,C)); 1949 atb->updateAt = PETSC_TRUE; 1950 } 1951 PetscFunctionReturn(0); 1952 } 1953 1954 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C) 1955 { 1956 Mat_Product *product = C->product; 1957 Mat A=product->A,B=product->B; 1958 PetscReal fill=product->fill; 1959 1960 PetscFunctionBegin; 1961 PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C)); 1962 1963 C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ; 1964 PetscFunctionReturn(0); 1965 } 1966 1967 /* --------------------------------------------------------------- */ 1968 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C) 1969 { 1970 Mat_Product *product = C->product; 1971 PetscInt alg = 0; /* default algorithm */ 1972 PetscBool flg = PETSC_FALSE; 1973 #if !defined(PETSC_HAVE_HYPRE) 1974 const char *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"}; 1975 PetscInt nalg = 7; 1976 #else 1977 const char *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"}; 1978 PetscInt nalg = 8; 1979 #endif 1980 1981 PetscFunctionBegin; 1982 /* Set default algorithm */ 1983 PetscCall(PetscStrcmp(C->product->alg,"default",&flg)); 1984 if (flg) { 1985 PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg])); 1986 } 1987 1988 /* Get runtime option */ 1989 if (product->api_user) { 1990 PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat"); 1991 PetscCall(PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg)); 1992 PetscOptionsEnd(); 1993 } else { 1994 PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat"); 1995 PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg)); 1996 PetscOptionsEnd(); 1997 } 1998 if (flg) { 1999 PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg])); 2000 } 2001 2002 C->ops->productsymbolic = MatProductSymbolic_AB; 2003 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ; 2004 PetscFunctionReturn(0); 2005 } 2006 2007 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C) 2008 { 2009 Mat_Product *product = C->product; 2010 PetscInt alg = 0; /* default algorithm */ 2011 PetscBool flg = PETSC_FALSE; 2012 const char *algTypes[3] = {"default","at*b","outerproduct"}; 2013 PetscInt nalg = 3; 2014 2015 PetscFunctionBegin; 2016 /* Get runtime option */ 2017 if (product->api_user) { 2018 PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat"); 2019 PetscCall(PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg)); 2020 PetscOptionsEnd(); 2021 } else { 2022 PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat"); 2023 PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg)); 2024 PetscOptionsEnd(); 2025 } 2026 if (flg) { 2027 PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg])); 2028 } 2029 2030 C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ; 2031 PetscFunctionReturn(0); 2032 } 2033 2034 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C) 2035 { 2036 Mat_Product *product = C->product; 2037 PetscInt alg = 0; /* default algorithm */ 2038 PetscBool flg = PETSC_FALSE; 2039 const char *algTypes[2] = {"default","color"}; 2040 PetscInt nalg = 2; 2041 2042 PetscFunctionBegin; 2043 /* Set default algorithm */ 2044 PetscCall(PetscStrcmp(C->product->alg,"default",&flg)); 2045 if (!flg) { 2046 alg = 1; 2047 PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg])); 2048 } 2049 2050 /* Get runtime option */ 2051 if (product->api_user) { 2052 PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat"); 2053 PetscCall(PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg)); 2054 PetscOptionsEnd(); 2055 } else { 2056 PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat"); 2057 PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg)); 2058 PetscOptionsEnd(); 2059 } 2060 if (flg) { 2061 PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg])); 2062 } 2063 2064 C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ; 2065 C->ops->productsymbolic = MatProductSymbolic_ABt; 2066 PetscFunctionReturn(0); 2067 } 2068 2069 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C) 2070 { 2071 Mat_Product *product = C->product; 2072 PetscBool flg = PETSC_FALSE; 2073 PetscInt alg = 0; /* default algorithm -- alg=1 should be default!!! */ 2074 #if !defined(PETSC_HAVE_HYPRE) 2075 const char *algTypes[2] = {"scalable","rap"}; 2076 PetscInt nalg = 2; 2077 #else 2078 const char *algTypes[3] = {"scalable","rap","hypre"}; 2079 PetscInt nalg = 3; 2080 #endif 2081 2082 PetscFunctionBegin; 2083 /* Set default algorithm */ 2084 PetscCall(PetscStrcmp(product->alg,"default",&flg)); 2085 if (flg) { 2086 PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg])); 2087 } 2088 2089 /* Get runtime option */ 2090 if (product->api_user) { 2091 PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat"); 2092 PetscCall(PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg)); 2093 PetscOptionsEnd(); 2094 } else { 2095 PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat"); 2096 PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg)); 2097 PetscOptionsEnd(); 2098 } 2099 if (flg) { 2100 PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg])); 2101 } 2102 2103 C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ; 2104 PetscFunctionReturn(0); 2105 } 2106 2107 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C) 2108 { 2109 Mat_Product *product = C->product; 2110 PetscBool flg = PETSC_FALSE; 2111 PetscInt alg = 0; /* default algorithm */ 2112 const char *algTypes[3] = {"r*a*rt","r*art","coloring_rart"}; 2113 PetscInt nalg = 3; 2114 2115 PetscFunctionBegin; 2116 /* Set default algorithm */ 2117 PetscCall(PetscStrcmp(product->alg,"default",&flg)); 2118 if (flg) { 2119 PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg])); 2120 } 2121 2122 /* Get runtime option */ 2123 if (product->api_user) { 2124 PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat"); 2125 PetscCall(PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg)); 2126 PetscOptionsEnd(); 2127 } else { 2128 PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat"); 2129 PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg)); 2130 PetscOptionsEnd(); 2131 } 2132 if (flg) { 2133 PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg])); 2134 } 2135 2136 C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ; 2137 PetscFunctionReturn(0); 2138 } 2139 2140 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */ 2141 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C) 2142 { 2143 Mat_Product *product = C->product; 2144 PetscInt alg = 0; /* default algorithm */ 2145 PetscBool flg = PETSC_FALSE; 2146 const char *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"}; 2147 PetscInt nalg = 7; 2148 2149 PetscFunctionBegin; 2150 /* Set default algorithm */ 2151 PetscCall(PetscStrcmp(product->alg,"default",&flg)); 2152 if (flg) { 2153 PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg])); 2154 } 2155 2156 /* Get runtime option */ 2157 if (product->api_user) { 2158 PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat"); 2159 PetscCall(PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg)); 2160 PetscOptionsEnd(); 2161 } else { 2162 PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat"); 2163 PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg)); 2164 PetscOptionsEnd(); 2165 } 2166 if (flg) { 2167 PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg])); 2168 } 2169 2170 C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ; 2171 C->ops->productsymbolic = MatProductSymbolic_ABC; 2172 PetscFunctionReturn(0); 2173 } 2174 2175 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C) 2176 { 2177 Mat_Product *product = C->product; 2178 2179 PetscFunctionBegin; 2180 switch (product->type) { 2181 case MATPRODUCT_AB: 2182 PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C)); 2183 break; 2184 case MATPRODUCT_AtB: 2185 PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C)); 2186 break; 2187 case MATPRODUCT_ABt: 2188 PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C)); 2189 break; 2190 case MATPRODUCT_PtAP: 2191 PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C)); 2192 break; 2193 case MATPRODUCT_RARt: 2194 PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C)); 2195 break; 2196 case MATPRODUCT_ABC: 2197 PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C)); 2198 break; 2199 default: 2200 break; 2201 } 2202 PetscFunctionReturn(0); 2203 } 2204