1 2 /* 3 Defines matrix-matrix product routines for pairs of SeqAIJ matrices 4 C = A * B 5 */ 6 7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8 #include <../src/mat/utils/freespace.h> 9 #include <petscbt.h> 10 #include <petsc/private/isimpl.h> 11 #include <../src/mat/impls/dense/seq/dense.h> 12 13 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 14 { 15 PetscErrorCode ierr; 16 17 PetscFunctionBegin; 18 if (C->ops->matmultnumeric) { 19 if (C->ops->matmultnumeric == MatMatMultNumeric_SeqAIJ_SeqAIJ) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Recursive call"); 20 ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr); 21 } else { 22 ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C);CHKERRQ(ierr); 23 } 24 PetscFunctionReturn(0); 25 } 26 27 /* Modified from MatCreateSeqAIJWithArrays() */ 28 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat) 29 { 30 PetscErrorCode ierr; 31 PetscInt ii; 32 Mat_SeqAIJ *aij; 33 PetscBool isseqaij; 34 35 PetscFunctionBegin; 36 if (m > 0 && i[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0"); 37 ierr = MatSetSizes(mat,m,n,m,n);CHKERRQ(ierr); 38 39 if (!mtype) { 40 ierr = PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij);CHKERRQ(ierr); 41 if (!isseqaij) { ierr = MatSetType(mat,MATSEQAIJ);CHKERRQ(ierr); } 42 } else { 43 ierr = MatSetType(mat,mtype);CHKERRQ(ierr); 44 } 45 ierr = MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 46 aij = (Mat_SeqAIJ*)(mat)->data; 47 ierr = PetscMalloc1(m,&aij->imax);CHKERRQ(ierr); 48 ierr = PetscMalloc1(m,&aij->ilen);CHKERRQ(ierr); 49 50 aij->i = i; 51 aij->j = j; 52 aij->a = a; 53 aij->singlemalloc = PETSC_FALSE; 54 aij->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 55 aij->free_a = PETSC_FALSE; 56 aij->free_ij = PETSC_FALSE; 57 58 for (ii=0; ii<m; ii++) { 59 aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii]; 60 } 61 PetscFunctionReturn(0); 62 } 63 64 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C) 65 { 66 PetscErrorCode ierr; 67 Mat_Product *product = C->product; 68 MatProductAlgorithm alg; 69 PetscBool flg; 70 71 PetscFunctionBegin; 72 if (product) { 73 alg = product->alg; 74 } else { 75 alg = "sorted"; 76 } 77 /* sorted */ 78 ierr = PetscStrcmp(alg,"sorted",&flg);CHKERRQ(ierr); 79 if (flg) { 80 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C);CHKERRQ(ierr); 81 PetscFunctionReturn(0); 82 } 83 84 /* scalable */ 85 ierr = PetscStrcmp(alg,"scalable",&flg);CHKERRQ(ierr); 86 if (flg) { 87 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr); 88 PetscFunctionReturn(0); 89 } 90 91 /* scalable_fast */ 92 ierr = PetscStrcmp(alg,"scalable_fast",&flg);CHKERRQ(ierr); 93 if (flg) { 94 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr); 95 PetscFunctionReturn(0); 96 } 97 98 /* heap */ 99 ierr = PetscStrcmp(alg,"heap",&flg);CHKERRQ(ierr); 100 if (flg) { 101 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr); 102 PetscFunctionReturn(0); 103 } 104 105 /* btheap */ 106 ierr = PetscStrcmp(alg,"btheap",&flg);CHKERRQ(ierr); 107 if (flg) { 108 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr); 109 PetscFunctionReturn(0); 110 } 111 112 /* llcondensed */ 113 ierr = PetscStrcmp(alg,"llcondensed",&flg);CHKERRQ(ierr); 114 if (flg) { 115 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr); 116 PetscFunctionReturn(0); 117 } 118 119 /* rowmerge */ 120 ierr = PetscStrcmp(alg,"rowmerge",&flg);CHKERRQ(ierr); 121 if (flg) { 122 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);CHKERRQ(ierr); 123 PetscFunctionReturn(0); 124 } 125 126 #if defined(PETSC_HAVE_HYPRE) 127 ierr = PetscStrcmp(alg,"hypre",&flg);CHKERRQ(ierr); 128 if (flg) { 129 ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr); 130 PetscFunctionReturn(0); 131 } 132 #endif 133 134 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported"); 135 } 136 137 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C) 138 { 139 PetscErrorCode ierr; 140 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 141 PetscInt *ai=a->i,*bi=b->i,*ci,*cj; 142 PetscInt am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 143 PetscReal afill; 144 PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 145 PetscTable ta; 146 PetscBT lnkbt; 147 PetscFreeSpaceList free_space=NULL,current_space=NULL; 148 149 PetscFunctionBegin; 150 /* Get ci and cj */ 151 /*---------------*/ 152 /* Allocate ci array, arrays for fill computation and */ 153 /* free space for accumulating nonzero column info */ 154 ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 155 ci[0] = 0; 156 157 /* create and initialize a linked list */ 158 ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 159 MatRowMergeMax_SeqAIJ(b,bm,ta); 160 ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 161 ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 162 163 ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr); 164 165 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 166 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 167 168 current_space = free_space; 169 170 /* Determine ci and cj */ 171 for (i=0; i<am; i++) { 172 anzi = ai[i+1] - ai[i]; 173 aj = a->j + ai[i]; 174 for (j=0; j<anzi; j++) { 175 brow = aj[j]; 176 bnzj = bi[brow+1] - bi[brow]; 177 bj = b->j + bi[brow]; 178 /* add non-zero cols of B into the sorted linked list lnk */ 179 ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr); 180 } 181 /* add possible missing diagonal entry */ 182 if (C->force_diagonals) { 183 ierr = PetscLLCondensedAddSorted(1,&i,lnk,lnkbt);CHKERRQ(ierr); 184 } 185 cnzi = lnk[0]; 186 187 /* If free space is not available, make more free space */ 188 /* Double the amount of total space in the list */ 189 if (current_space->local_remaining<cnzi) { 190 ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 191 ndouble++; 192 } 193 194 /* Copy data into free space, then initialize lnk */ 195 ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 196 197 current_space->array += cnzi; 198 current_space->local_used += cnzi; 199 current_space->local_remaining -= cnzi; 200 201 ci[i+1] = ci[i] + cnzi; 202 } 203 204 /* Column indices are in the list of free space */ 205 /* Allocate space for cj, initialize cj, and */ 206 /* destroy list of free space and other temporary array(s) */ 207 ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 208 ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 209 ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr); 210 211 /* put together the new symbolic matrix */ 212 ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 213 ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 214 215 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 216 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 217 c = (Mat_SeqAIJ*)(C->data); 218 c->free_a = PETSC_FALSE; 219 c->free_ij = PETSC_TRUE; 220 c->nonew = 0; 221 222 /* fast, needs non-scalable O(bn) array 'abdense' */ 223 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 224 225 /* set MatInfo */ 226 afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 227 if (afill < 1.0) afill = 1.0; 228 c->maxnz = ci[am]; 229 c->nz = ci[am]; 230 C->info.mallocs = ndouble; 231 C->info.fill_ratio_given = fill; 232 C->info.fill_ratio_needed = afill; 233 234 #if defined(PETSC_USE_INFO) 235 if (ci[am]) { 236 ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 237 ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 238 } else { 239 ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 240 } 241 #endif 242 PetscFunctionReturn(0); 243 } 244 245 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C) 246 { 247 PetscErrorCode ierr; 248 PetscLogDouble flops=0.0; 249 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 250 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 251 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 252 PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j; 253 PetscInt am =A->rmap->n,cm=C->rmap->n; 254 PetscInt i,j,k,anzi,bnzi,cnzi,brow; 255 PetscScalar *ca,valtmp; 256 PetscScalar *ab_dense; 257 PetscContainer cab_dense; 258 const PetscScalar *aa,*ba,*baj; 259 260 PetscFunctionBegin; 261 ierr = MatSeqAIJGetArrayRead(A,&aa);CHKERRQ(ierr); 262 ierr = MatSeqAIJGetArrayRead(B,&ba);CHKERRQ(ierr); 263 if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 264 ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 265 c->a = ca; 266 c->free_a = PETSC_TRUE; 267 } else ca = c->a; 268 269 /* TODO this should be done in the symbolic phase */ 270 /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers 271 that is hard to eradicate) */ 272 ierr = PetscObjectQuery((PetscObject)C,"__PETSc__ab_dense",(PetscObject*)&cab_dense);CHKERRQ(ierr); 273 if (!cab_dense) { 274 ierr = PetscMalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr); 275 ierr = PetscContainerCreate(PETSC_COMM_SELF,&cab_dense);CHKERRQ(ierr); 276 ierr = PetscContainerSetPointer(cab_dense,ab_dense);CHKERRQ(ierr); 277 ierr = PetscContainerSetUserDestroy(cab_dense,PetscContainerUserDestroyDefault);CHKERRQ(ierr); 278 ierr = PetscObjectCompose((PetscObject)C,"__PETSc__ab_dense",(PetscObject)cab_dense);CHKERRQ(ierr); 279 ierr = PetscObjectDereference((PetscObject)cab_dense);CHKERRQ(ierr); 280 } 281 ierr = PetscContainerGetPointer(cab_dense,(void**)&ab_dense);CHKERRQ(ierr); 282 ierr = PetscArrayzero(ab_dense,B->cmap->N);CHKERRQ(ierr); 283 284 /* clean old values in C */ 285 ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr); 286 /* Traverse A row-wise. */ 287 /* Build the ith row in C by summing over nonzero columns in A, */ 288 /* the rows of B corresponding to nonzeros of A. */ 289 for (i=0; i<am; i++) { 290 anzi = ai[i+1] - ai[i]; 291 for (j=0; j<anzi; j++) { 292 brow = aj[j]; 293 bnzi = bi[brow+1] - bi[brow]; 294 bjj = bj + bi[brow]; 295 baj = ba + bi[brow]; 296 /* perform dense axpy */ 297 valtmp = aa[j]; 298 for (k=0; k<bnzi; k++) { 299 ab_dense[bjj[k]] += valtmp*baj[k]; 300 } 301 flops += 2*bnzi; 302 } 303 aj += anzi; aa += anzi; 304 305 cnzi = ci[i+1] - ci[i]; 306 for (k=0; k<cnzi; k++) { 307 ca[k] += ab_dense[cj[k]]; 308 ab_dense[cj[k]] = 0.0; /* zero ab_dense */ 309 } 310 flops += cnzi; 311 cj += cnzi; ca += cnzi; 312 } 313 #if defined(PETSC_HAVE_DEVICE) 314 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 315 #endif 316 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 317 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 318 ierr = PetscLogFlops(flops);CHKERRQ(ierr); 319 ierr = MatSeqAIJRestoreArrayRead(A,&aa);CHKERRQ(ierr); 320 ierr = MatSeqAIJRestoreArrayRead(B,&ba);CHKERRQ(ierr); 321 PetscFunctionReturn(0); 322 } 323 324 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C) 325 { 326 PetscErrorCode ierr; 327 PetscLogDouble flops=0.0; 328 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 329 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 330 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 331 PetscInt *ai = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j; 332 PetscInt am = A->rmap->N,cm=C->rmap->N; 333 PetscInt i,j,k,anzi,bnzi,cnzi,brow; 334 PetscScalar *ca=c->a,valtmp; 335 const PetscScalar *aa,*ba,*baj; 336 PetscInt nextb; 337 338 PetscFunctionBegin; 339 ierr = MatSeqAIJGetArrayRead(A,&aa);CHKERRQ(ierr); 340 ierr = MatSeqAIJGetArrayRead(B,&ba);CHKERRQ(ierr); 341 if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 342 ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 343 c->a = ca; 344 c->free_a = PETSC_TRUE; 345 } 346 347 /* clean old values in C */ 348 ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr); 349 /* Traverse A row-wise. */ 350 /* Build the ith row in C by summing over nonzero columns in A, */ 351 /* the rows of B corresponding to nonzeros of A. */ 352 for (i=0; i<am; i++) { 353 anzi = ai[i+1] - ai[i]; 354 cnzi = ci[i+1] - ci[i]; 355 for (j=0; j<anzi; j++) { 356 brow = aj[j]; 357 bnzi = bi[brow+1] - bi[brow]; 358 bjj = bj + bi[brow]; 359 baj = ba + bi[brow]; 360 /* perform sparse axpy */ 361 valtmp = aa[j]; 362 nextb = 0; 363 for (k=0; nextb<bnzi; k++) { 364 if (cj[k] == bjj[nextb]) { /* ccol == bcol */ 365 ca[k] += valtmp*baj[nextb++]; 366 } 367 } 368 flops += 2*bnzi; 369 } 370 aj += anzi; aa += anzi; 371 cj += cnzi; ca += cnzi; 372 } 373 #if defined(PETSC_HAVE_DEVICE) 374 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 375 #endif 376 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 377 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 378 ierr = PetscLogFlops(flops);CHKERRQ(ierr); 379 ierr = MatSeqAIJRestoreArrayRead(A,&aa);CHKERRQ(ierr); 380 ierr = MatSeqAIJRestoreArrayRead(B,&ba);CHKERRQ(ierr); 381 PetscFunctionReturn(0); 382 } 383 384 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C) 385 { 386 PetscErrorCode ierr; 387 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 388 PetscInt *ai = a->i,*bi=b->i,*ci,*cj; 389 PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 390 MatScalar *ca; 391 PetscReal afill; 392 PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 393 PetscTable ta; 394 PetscFreeSpaceList free_space=NULL,current_space=NULL; 395 396 PetscFunctionBegin; 397 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */ 398 /*-----------------------------------------------------------------------------------------*/ 399 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 400 ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 401 ci[0] = 0; 402 403 /* create and initialize a linked list */ 404 ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 405 MatRowMergeMax_SeqAIJ(b,bm,ta); 406 ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 407 ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 408 409 ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr); 410 411 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 412 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 413 current_space = free_space; 414 415 /* Determine ci and cj */ 416 for (i=0; i<am; i++) { 417 anzi = ai[i+1] - ai[i]; 418 aj = a->j + ai[i]; 419 for (j=0; j<anzi; j++) { 420 brow = aj[j]; 421 bnzj = bi[brow+1] - bi[brow]; 422 bj = b->j + bi[brow]; 423 /* add non-zero cols of B into the sorted linked list lnk */ 424 ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr); 425 } 426 /* add possible missing diagonal entry */ 427 if (C->force_diagonals) { 428 ierr = PetscLLCondensedAddSorted_fast(1,&i,lnk);CHKERRQ(ierr); 429 } 430 cnzi = lnk[1]; 431 432 /* If free space is not available, make more free space */ 433 /* Double the amount of total space in the list */ 434 if (current_space->local_remaining<cnzi) { 435 ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 436 ndouble++; 437 } 438 439 /* Copy data into free space, then initialize lnk */ 440 ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr); 441 442 current_space->array += cnzi; 443 current_space->local_used += cnzi; 444 current_space->local_remaining -= cnzi; 445 446 ci[i+1] = ci[i] + cnzi; 447 } 448 449 /* Column indices are in the list of free space */ 450 /* Allocate space for cj, initialize cj, and */ 451 /* destroy list of free space and other temporary array(s) */ 452 ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 453 ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 454 ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr); 455 456 /* Allocate space for ca */ 457 ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr); 458 459 /* put together the new symbolic matrix */ 460 ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 461 ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 462 463 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 464 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 465 c = (Mat_SeqAIJ*)(C->data); 466 c->free_a = PETSC_TRUE; 467 c->free_ij = PETSC_TRUE; 468 c->nonew = 0; 469 470 /* slower, less memory */ 471 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 472 473 /* set MatInfo */ 474 afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 475 if (afill < 1.0) afill = 1.0; 476 c->maxnz = ci[am]; 477 c->nz = ci[am]; 478 C->info.mallocs = ndouble; 479 C->info.fill_ratio_given = fill; 480 C->info.fill_ratio_needed = afill; 481 482 #if defined(PETSC_USE_INFO) 483 if (ci[am]) { 484 ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 485 ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 486 } else { 487 ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 488 } 489 #endif 490 PetscFunctionReturn(0); 491 } 492 493 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C) 494 { 495 PetscErrorCode ierr; 496 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 497 PetscInt *ai = a->i,*bi=b->i,*ci,*cj; 498 PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 499 MatScalar *ca; 500 PetscReal afill; 501 PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 502 PetscTable ta; 503 PetscFreeSpaceList free_space=NULL,current_space=NULL; 504 505 PetscFunctionBegin; 506 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */ 507 /*---------------------------------------------------------------------------------------------*/ 508 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 509 ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 510 ci[0] = 0; 511 512 /* create and initialize a linked list */ 513 ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 514 MatRowMergeMax_SeqAIJ(b,bm,ta); 515 ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 516 ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 517 ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr); 518 519 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 520 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 521 current_space = free_space; 522 523 /* Determine ci and cj */ 524 for (i=0; i<am; i++) { 525 anzi = ai[i+1] - ai[i]; 526 aj = a->j + ai[i]; 527 for (j=0; j<anzi; j++) { 528 brow = aj[j]; 529 bnzj = bi[brow+1] - bi[brow]; 530 bj = b->j + bi[brow]; 531 /* add non-zero cols of B into the sorted linked list lnk */ 532 ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr); 533 } 534 /* add possible missing diagonal entry */ 535 if (C->force_diagonals) { 536 ierr = PetscLLCondensedAddSorted_Scalable(1,&i,lnk);CHKERRQ(ierr); 537 } 538 539 cnzi = lnk[0]; 540 541 /* If free space is not available, make more free space */ 542 /* Double the amount of total space in the list */ 543 if (current_space->local_remaining<cnzi) { 544 ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 545 ndouble++; 546 } 547 548 /* Copy data into free space, then initialize lnk */ 549 ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr); 550 551 current_space->array += cnzi; 552 current_space->local_used += cnzi; 553 current_space->local_remaining -= cnzi; 554 555 ci[i+1] = ci[i] + cnzi; 556 } 557 558 /* Column indices are in the list of free space */ 559 /* Allocate space for cj, initialize cj, and */ 560 /* destroy list of free space and other temporary array(s) */ 561 ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 562 ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 563 ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); 564 565 /* Allocate space for ca */ 566 /*-----------------------*/ 567 ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr); 568 569 /* put together the new symbolic matrix */ 570 ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 571 ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 572 573 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 574 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 575 c = (Mat_SeqAIJ*)(C->data); 576 c->free_a = PETSC_TRUE; 577 c->free_ij = PETSC_TRUE; 578 c->nonew = 0; 579 580 /* slower, less memory */ 581 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 582 583 /* set MatInfo */ 584 afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 585 if (afill < 1.0) afill = 1.0; 586 c->maxnz = ci[am]; 587 c->nz = ci[am]; 588 C->info.mallocs = ndouble; 589 C->info.fill_ratio_given = fill; 590 C->info.fill_ratio_needed = afill; 591 592 #if defined(PETSC_USE_INFO) 593 if (ci[am]) { 594 ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 595 ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 596 } else { 597 ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 598 } 599 #endif 600 PetscFunctionReturn(0); 601 } 602 603 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C) 604 { 605 PetscErrorCode ierr; 606 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 607 const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j; 608 PetscInt *ci,*cj,*bb; 609 PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 610 PetscReal afill; 611 PetscInt i,j,col,ndouble = 0; 612 PetscFreeSpaceList free_space=NULL,current_space=NULL; 613 PetscHeap h; 614 615 PetscFunctionBegin; 616 /* Get ci and cj - by merging sorted rows using a heap */ 617 /*---------------------------------------------------------------------------------------------*/ 618 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 619 ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 620 ci[0] = 0; 621 622 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 623 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 624 current_space = free_space; 625 626 ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr); 627 ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr); 628 629 /* Determine ci and cj */ 630 for (i=0; i<am; i++) { 631 const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 632 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 633 ci[i+1] = ci[i]; 634 /* Populate the min heap */ 635 for (j=0; j<anzi; j++) { 636 bb[j] = bi[acol[j]]; /* bb points at the start of the row */ 637 if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */ 638 ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr); 639 } 640 } 641 /* Pick off the min element, adding it to free space */ 642 ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 643 while (j >= 0) { 644 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 645 ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space);CHKERRQ(ierr); 646 ndouble++; 647 } 648 *(current_space->array++) = col; 649 current_space->local_used++; 650 current_space->local_remaining--; 651 ci[i+1]++; 652 653 /* stash if anything else remains in this row of B */ 654 if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);} 655 while (1) { /* pop and stash any other rows of B that also had an entry in this column */ 656 PetscInt j2,col2; 657 ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr); 658 if (col2 != col) break; 659 ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr); 660 if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);} 661 } 662 /* Put any stashed elements back into the min heap */ 663 ierr = PetscHeapUnstash(h);CHKERRQ(ierr); 664 ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 665 } 666 } 667 ierr = PetscFree(bb);CHKERRQ(ierr); 668 ierr = PetscHeapDestroy(&h);CHKERRQ(ierr); 669 670 /* Column indices are in the list of free space */ 671 /* Allocate space for cj, initialize cj, and */ 672 /* destroy list of free space and other temporary array(s) */ 673 ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr); 674 ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 675 676 /* put together the new symbolic matrix */ 677 ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 678 ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 679 680 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 681 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 682 c = (Mat_SeqAIJ*)(C->data); 683 c->free_a = PETSC_TRUE; 684 c->free_ij = PETSC_TRUE; 685 c->nonew = 0; 686 687 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 688 689 /* set MatInfo */ 690 afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 691 if (afill < 1.0) afill = 1.0; 692 c->maxnz = ci[am]; 693 c->nz = ci[am]; 694 C->info.mallocs = ndouble; 695 C->info.fill_ratio_given = fill; 696 C->info.fill_ratio_needed = afill; 697 698 #if defined(PETSC_USE_INFO) 699 if (ci[am]) { 700 ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 701 ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 702 } else { 703 ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 704 } 705 #endif 706 PetscFunctionReturn(0); 707 } 708 709 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C) 710 { 711 PetscErrorCode ierr; 712 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 713 const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j; 714 PetscInt *ci,*cj,*bb; 715 PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 716 PetscReal afill; 717 PetscInt i,j,col,ndouble = 0; 718 PetscFreeSpaceList free_space=NULL,current_space=NULL; 719 PetscHeap h; 720 PetscBT bt; 721 722 PetscFunctionBegin; 723 /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */ 724 /*---------------------------------------------------------------------------------------------*/ 725 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 726 ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 727 ci[0] = 0; 728 729 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 730 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 731 732 current_space = free_space; 733 734 ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr); 735 ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr); 736 ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr); 737 738 /* Determine ci and cj */ 739 for (i=0; i<am; i++) { 740 const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 741 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 742 const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */ 743 ci[i+1] = ci[i]; 744 /* Populate the min heap */ 745 for (j=0; j<anzi; j++) { 746 PetscInt brow = acol[j]; 747 for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) { 748 PetscInt bcol = bj[bb[j]]; 749 if (!PetscBTLookupSet(bt,bcol)) { /* new entry */ 750 ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr); 751 bb[j]++; 752 break; 753 } 754 } 755 } 756 /* Pick off the min element, adding it to free space */ 757 ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 758 while (j >= 0) { 759 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 760 fptr = NULL; /* need PetscBTMemzero */ 761 ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space);CHKERRQ(ierr); 762 ndouble++; 763 } 764 *(current_space->array++) = col; 765 current_space->local_used++; 766 current_space->local_remaining--; 767 ci[i+1]++; 768 769 /* stash if anything else remains in this row of B */ 770 for (; bb[j] < bi[acol[j]+1]; bb[j]++) { 771 PetscInt bcol = bj[bb[j]]; 772 if (!PetscBTLookupSet(bt,bcol)) { /* new entry */ 773 ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr); 774 bb[j]++; 775 break; 776 } 777 } 778 ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 779 } 780 if (fptr) { /* Clear the bits for this row */ 781 for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);} 782 } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */ 783 ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr); 784 } 785 } 786 ierr = PetscFree(bb);CHKERRQ(ierr); 787 ierr = PetscHeapDestroy(&h);CHKERRQ(ierr); 788 ierr = PetscBTDestroy(&bt);CHKERRQ(ierr); 789 790 /* Column indices are in the list of free space */ 791 /* Allocate space for cj, initialize cj, and */ 792 /* destroy list of free space and other temporary array(s) */ 793 ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr); 794 ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 795 796 /* put together the new symbolic matrix */ 797 ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 798 ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 799 800 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 801 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 802 c = (Mat_SeqAIJ*)(C->data); 803 c->free_a = PETSC_TRUE; 804 c->free_ij = PETSC_TRUE; 805 c->nonew = 0; 806 807 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 808 809 /* set MatInfo */ 810 afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 811 if (afill < 1.0) afill = 1.0; 812 c->maxnz = ci[am]; 813 c->nz = ci[am]; 814 C->info.mallocs = ndouble; 815 C->info.fill_ratio_given = fill; 816 C->info.fill_ratio_needed = afill; 817 818 #if defined(PETSC_USE_INFO) 819 if (ci[am]) { 820 ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 821 ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 822 } else { 823 ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 824 } 825 #endif 826 PetscFunctionReturn(0); 827 } 828 829 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C) 830 { 831 PetscErrorCode ierr; 832 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 833 const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1; 834 PetscInt *ci,*cj,*outputj,worki_L1[9],worki_L2[9]; 835 PetscInt c_maxmem,a_maxrownnz=0,a_rownnz; 836 const PetscInt workcol[8]={0,1,2,3,4,5,6,7}; 837 const PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 838 const PetscInt *brow_ptr[8],*brow_end[8]; 839 PetscInt window[8]; 840 PetscInt window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows; 841 PetscInt i,k,ndouble=0,L1_rowsleft,rowsleft; 842 PetscReal afill; 843 PetscInt *workj_L1,*workj_L2,*workj_L3; 844 PetscInt L1_nnz,L2_nnz; 845 846 /* Step 1: Get upper bound on memory required for allocation. 847 Because of the way virtual memory works, 848 only the memory pages that are actually needed will be physically allocated. */ 849 PetscFunctionBegin; 850 ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr); 851 for (i=0; i<am; i++) { 852 const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 853 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 854 a_rownnz = 0; 855 for (k=0; k<anzi; ++k) { 856 a_rownnz += bi[acol[k]+1] - bi[acol[k]]; 857 if (a_rownnz > bn) { 858 a_rownnz = bn; 859 break; 860 } 861 } 862 a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 863 } 864 /* temporary work areas for merging rows */ 865 ierr = PetscMalloc1(a_maxrownnz*8,&workj_L1);CHKERRQ(ierr); 866 ierr = PetscMalloc1(a_maxrownnz*8,&workj_L2);CHKERRQ(ierr); 867 ierr = PetscMalloc1(a_maxrownnz,&workj_L3);CHKERRQ(ierr); 868 869 /* This should be enough for almost all matrices. If not, memory is reallocated later. */ 870 c_maxmem = 8*(ai[am]+bi[bm]); 871 /* Step 2: Populate pattern for C */ 872 ierr = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr); 873 874 ci_nnz = 0; 875 ci[0] = 0; 876 worki_L1[0] = 0; 877 worki_L2[0] = 0; 878 for (i=0; i<am; i++) { 879 const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 880 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 881 rowsleft = anzi; 882 inputcol_L1 = acol; 883 L2_nnz = 0; 884 L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */ 885 worki_L2[1] = 0; 886 outputi_nnz = 0; 887 888 /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */ 889 while (ci_nnz+a_maxrownnz > c_maxmem) { 890 c_maxmem *= 2; 891 ndouble++; 892 ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr); 893 } 894 895 while (rowsleft) { 896 L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */ 897 L1_nrows = 0; 898 L1_nnz = 0; 899 inputcol = inputcol_L1; 900 inputi = bi; 901 inputj = bj; 902 903 /* The following macro is used to specialize for small rows in A. 904 This helps with compiler unrolling, improving performance substantially. 905 Input: inputj inputi inputcol bn 906 Output: outputj outputi_nnz */ 907 #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \ 908 window_min = bn; \ 909 outputi_nnz = 0; \ 910 for (k=0; k<ANNZ; ++k) { \ 911 brow_ptr[k] = inputj + inputi[inputcol[k]]; \ 912 brow_end[k] = inputj + inputi[inputcol[k]+1]; \ 913 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 914 window_min = PetscMin(window[k], window_min); \ 915 } \ 916 while (window_min < bn) { \ 917 outputj[outputi_nnz++] = window_min; \ 918 /* advance front and compute new minimum */ \ 919 old_window_min = window_min; \ 920 window_min = bn; \ 921 for (k=0; k<ANNZ; ++k) { \ 922 if (window[k] == old_window_min) { \ 923 brow_ptr[k]++; \ 924 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 925 } \ 926 window_min = PetscMin(window[k], window_min); \ 927 } \ 928 } 929 930 /************** L E V E L 1 ***************/ 931 /* Merge up to 8 rows of B to L1 work array*/ 932 while (L1_rowsleft) { 933 outputi_nnz = 0; 934 if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/ 935 else outputj = cj + ci_nnz; /* Merge directly to C */ 936 937 switch (L1_rowsleft) { 938 case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 939 brow_end[0] = inputj + inputi[inputcol[0]+1]; 940 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 941 inputcol += L1_rowsleft; 942 rowsleft -= L1_rowsleft; 943 L1_rowsleft = 0; 944 break; 945 case 2: MatMatMultSymbolic_RowMergeMacro(2); 946 inputcol += L1_rowsleft; 947 rowsleft -= L1_rowsleft; 948 L1_rowsleft = 0; 949 break; 950 case 3: MatMatMultSymbolic_RowMergeMacro(3); 951 inputcol += L1_rowsleft; 952 rowsleft -= L1_rowsleft; 953 L1_rowsleft = 0; 954 break; 955 case 4: MatMatMultSymbolic_RowMergeMacro(4); 956 inputcol += L1_rowsleft; 957 rowsleft -= L1_rowsleft; 958 L1_rowsleft = 0; 959 break; 960 case 5: MatMatMultSymbolic_RowMergeMacro(5); 961 inputcol += L1_rowsleft; 962 rowsleft -= L1_rowsleft; 963 L1_rowsleft = 0; 964 break; 965 case 6: MatMatMultSymbolic_RowMergeMacro(6); 966 inputcol += L1_rowsleft; 967 rowsleft -= L1_rowsleft; 968 L1_rowsleft = 0; 969 break; 970 case 7: MatMatMultSymbolic_RowMergeMacro(7); 971 inputcol += L1_rowsleft; 972 rowsleft -= L1_rowsleft; 973 L1_rowsleft = 0; 974 break; 975 default: MatMatMultSymbolic_RowMergeMacro(8); 976 inputcol += 8; 977 rowsleft -= 8; 978 L1_rowsleft -= 8; 979 break; 980 } 981 inputcol_L1 = inputcol; 982 L1_nnz += outputi_nnz; 983 worki_L1[++L1_nrows] = L1_nnz; 984 } 985 986 /********************** L E V E L 2 ************************/ 987 /* Merge from L1 work array to either C or to L2 work array */ 988 if (anzi > 8) { 989 inputi = worki_L1; 990 inputj = workj_L1; 991 inputcol = workcol; 992 outputi_nnz = 0; 993 994 if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */ 995 else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */ 996 997 switch (L1_nrows) { 998 case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 999 brow_end[0] = inputj + inputi[inputcol[0]+1]; 1000 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1001 break; 1002 case 2: MatMatMultSymbolic_RowMergeMacro(2); break; 1003 case 3: MatMatMultSymbolic_RowMergeMacro(3); break; 1004 case 4: MatMatMultSymbolic_RowMergeMacro(4); break; 1005 case 5: MatMatMultSymbolic_RowMergeMacro(5); break; 1006 case 6: MatMatMultSymbolic_RowMergeMacro(6); break; 1007 case 7: MatMatMultSymbolic_RowMergeMacro(7); break; 1008 case 8: MatMatMultSymbolic_RowMergeMacro(8); break; 1009 default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!"); 1010 } 1011 L2_nnz += outputi_nnz; 1012 worki_L2[++L2_nrows] = L2_nnz; 1013 1014 /************************ L E V E L 3 **********************/ 1015 /* Merge from L2 work array to either C or to L2 work array */ 1016 if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) { 1017 inputi = worki_L2; 1018 inputj = workj_L2; 1019 inputcol = workcol; 1020 outputi_nnz = 0; 1021 if (rowsleft) outputj = workj_L3; 1022 else outputj = cj + ci_nnz; 1023 switch (L2_nrows) { 1024 case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 1025 brow_end[0] = inputj + inputi[inputcol[0]+1]; 1026 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1027 break; 1028 case 2: MatMatMultSymbolic_RowMergeMacro(2); break; 1029 case 3: MatMatMultSymbolic_RowMergeMacro(3); break; 1030 case 4: MatMatMultSymbolic_RowMergeMacro(4); break; 1031 case 5: MatMatMultSymbolic_RowMergeMacro(5); break; 1032 case 6: MatMatMultSymbolic_RowMergeMacro(6); break; 1033 case 7: MatMatMultSymbolic_RowMergeMacro(7); break; 1034 case 8: MatMatMultSymbolic_RowMergeMacro(8); break; 1035 default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!"); 1036 } 1037 L2_nrows = 1; 1038 L2_nnz = outputi_nnz; 1039 worki_L2[1] = outputi_nnz; 1040 /* Copy to workj_L2 */ 1041 if (rowsleft) { 1042 for (k=0; k<outputi_nnz; ++k) workj_L2[k] = outputj[k]; 1043 } 1044 } 1045 } 1046 } /* while (rowsleft) */ 1047 #undef MatMatMultSymbolic_RowMergeMacro 1048 1049 /* terminate current row */ 1050 ci_nnz += outputi_nnz; 1051 ci[i+1] = ci_nnz; 1052 } 1053 1054 /* Step 3: Create the new symbolic matrix */ 1055 ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 1056 ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 1057 1058 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1059 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1060 c = (Mat_SeqAIJ*)(C->data); 1061 c->free_a = PETSC_TRUE; 1062 c->free_ij = PETSC_TRUE; 1063 c->nonew = 0; 1064 1065 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1066 1067 /* set MatInfo */ 1068 afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 1069 if (afill < 1.0) afill = 1.0; 1070 c->maxnz = ci[am]; 1071 c->nz = ci[am]; 1072 C->info.mallocs = ndouble; 1073 C->info.fill_ratio_given = fill; 1074 C->info.fill_ratio_needed = afill; 1075 1076 #if defined(PETSC_USE_INFO) 1077 if (ci[am]) { 1078 ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 1079 ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 1080 } else { 1081 ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 1082 } 1083 #endif 1084 1085 /* Step 4: Free temporary work areas */ 1086 ierr = PetscFree(workj_L1);CHKERRQ(ierr); 1087 ierr = PetscFree(workj_L2);CHKERRQ(ierr); 1088 ierr = PetscFree(workj_L3);CHKERRQ(ierr); 1089 PetscFunctionReturn(0); 1090 } 1091 1092 /* concatenate unique entries and then sort */ 1093 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C) 1094 { 1095 PetscErrorCode ierr; 1096 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 1097 const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j; 1098 PetscInt *ci,*cj,bcol; 1099 PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 1100 PetscReal afill; 1101 PetscInt i,j,ndouble = 0; 1102 PetscSegBuffer seg,segrow; 1103 char *seen; 1104 1105 PetscFunctionBegin; 1106 ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr); 1107 ci[0] = 0; 1108 1109 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 1110 ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr); 1111 ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr); 1112 ierr = PetscCalloc1(bn,&seen);CHKERRQ(ierr); 1113 1114 /* Determine ci and cj */ 1115 for (i=0; i<am; i++) { 1116 const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1117 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 1118 PetscInt packlen = 0,*PETSC_RESTRICT crow; 1119 1120 /* Pack segrow */ 1121 for (j=0; j<anzi; j++) { 1122 PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k; 1123 for (k=bjstart; k<bjend; k++) { 1124 bcol = bj[k]; 1125 if (!seen[bcol]) { /* new entry */ 1126 PetscInt *PETSC_RESTRICT slot; 1127 ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr); 1128 *slot = bcol; 1129 seen[bcol] = 1; 1130 packlen++; 1131 } 1132 } 1133 } 1134 1135 /* Check i-th diagonal entry */ 1136 if (C->force_diagonals && !seen[i]) { 1137 PetscInt *PETSC_RESTRICT slot; 1138 ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr); 1139 *slot = i; 1140 seen[i] = 1; 1141 packlen++; 1142 } 1143 1144 ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr); 1145 ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr); 1146 ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr); 1147 ci[i+1] = ci[i] + packlen; 1148 for (j=0; j<packlen; j++) seen[crow[j]] = 0; 1149 } 1150 ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr); 1151 ierr = PetscFree(seen);CHKERRQ(ierr); 1152 1153 /* Column indices are in the segmented buffer */ 1154 ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr); 1155 ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr); 1156 1157 /* put together the new symbolic matrix */ 1158 ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 1159 ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 1160 1161 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1162 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1163 c = (Mat_SeqAIJ*)(C->data); 1164 c->free_a = PETSC_TRUE; 1165 c->free_ij = PETSC_TRUE; 1166 c->nonew = 0; 1167 1168 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1169 1170 /* set MatInfo */ 1171 afill = (PetscReal)ci[am]/PetscMax(ai[am]+bi[bm],1) + 1.e-5; 1172 if (afill < 1.0) afill = 1.0; 1173 c->maxnz = ci[am]; 1174 c->nz = ci[am]; 1175 C->info.mallocs = ndouble; 1176 C->info.fill_ratio_given = fill; 1177 C->info.fill_ratio_needed = afill; 1178 1179 #if defined(PETSC_USE_INFO) 1180 if (ci[am]) { 1181 ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 1182 ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 1183 } else { 1184 ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 1185 } 1186 #endif 1187 PetscFunctionReturn(0); 1188 } 1189 1190 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data) 1191 { 1192 PetscErrorCode ierr; 1193 Mat_MatMatTransMult *abt=(Mat_MatMatTransMult *)data; 1194 1195 PetscFunctionBegin; 1196 ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr); 1197 ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr); 1198 ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr); 1199 ierr = PetscFree(abt);CHKERRQ(ierr); 1200 PetscFunctionReturn(0); 1201 } 1202 1203 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C) 1204 { 1205 PetscErrorCode ierr; 1206 Mat Bt; 1207 PetscInt *bti,*btj; 1208 Mat_MatMatTransMult *abt; 1209 Mat_Product *product = C->product; 1210 char *alg; 1211 1212 PetscFunctionBegin; 1213 if (!product) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 1214 if (product->data) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty"); 1215 1216 /* create symbolic Bt */ 1217 ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr); 1218 ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr); 1219 ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr); 1220 ierr = MatSetType(Bt,((PetscObject)A)->type_name);CHKERRQ(ierr); 1221 1222 /* get symbolic C=A*Bt */ 1223 ierr = PetscStrallocpy(product->alg,&alg);CHKERRQ(ierr); 1224 ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); /* set algorithm for C = A*Bt */ 1225 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr); 1226 ierr = MatProductSetAlgorithm(C,alg);CHKERRQ(ierr); /* resume original algorithm for ABt product */ 1227 ierr = PetscFree(alg);CHKERRQ(ierr); 1228 1229 /* create a supporting struct for reuse intermediate dense matrices with matcoloring */ 1230 ierr = PetscNew(&abt);CHKERRQ(ierr); 1231 1232 product->data = abt; 1233 product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans; 1234 1235 C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ; 1236 1237 abt->usecoloring = PETSC_FALSE; 1238 ierr = PetscStrcmp(product->alg,"color",&abt->usecoloring);CHKERRQ(ierr); 1239 if (abt->usecoloring) { 1240 /* Create MatTransposeColoring from symbolic C=A*B^T */ 1241 MatTransposeColoring matcoloring; 1242 MatColoring coloring; 1243 ISColoring iscoloring; 1244 Mat Bt_dense,C_dense; 1245 1246 /* inode causes memory problem */ 1247 ierr = MatSetOption(C,MAT_USE_INODES,PETSC_FALSE);CHKERRQ(ierr); 1248 1249 ierr = MatColoringCreate(C,&coloring);CHKERRQ(ierr); 1250 ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr); 1251 ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr); 1252 ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr); 1253 ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr); 1254 ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr); 1255 ierr = MatTransposeColoringCreate(C,iscoloring,&matcoloring);CHKERRQ(ierr); 1256 1257 abt->matcoloring = matcoloring; 1258 1259 ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr); 1260 1261 /* Create Bt_dense and C_dense = A*Bt_dense */ 1262 ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr); 1263 ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr); 1264 ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr); 1265 ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr); 1266 1267 Bt_dense->assembled = PETSC_TRUE; 1268 abt->Bt_den = Bt_dense; 1269 1270 ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr); 1271 ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr); 1272 ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr); 1273 ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr); 1274 1275 Bt_dense->assembled = PETSC_TRUE; 1276 abt->ABt_den = C_dense; 1277 1278 #if defined(PETSC_USE_INFO) 1279 { 1280 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 1281 ierr = PetscInfo7(C,"Use coloring of C=A*B^T; B^T: %D %D, Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr); 1282 } 1283 #endif 1284 } 1285 /* clean up */ 1286 ierr = MatDestroy(&Bt);CHKERRQ(ierr); 1287 ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr); 1288 PetscFunctionReturn(0); 1289 } 1290 1291 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 1292 { 1293 PetscErrorCode ierr; 1294 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data; 1295 PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow; 1296 PetscInt cm =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol; 1297 PetscLogDouble flops=0.0; 1298 MatScalar *aa =a->a,*aval,*ba=b->a,*bval,*ca,*cval; 1299 Mat_MatMatTransMult *abt; 1300 Mat_Product *product = C->product; 1301 1302 PetscFunctionBegin; 1303 if (!product) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 1304 abt = (Mat_MatMatTransMult *)product->data; 1305 if (!abt) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 1306 /* clear old values in C */ 1307 if (!c->a) { 1308 ierr = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 1309 c->a = ca; 1310 c->free_a = PETSC_TRUE; 1311 } else { 1312 ca = c->a; 1313 ierr = PetscArrayzero(ca,ci[cm]+1);CHKERRQ(ierr); 1314 } 1315 1316 if (abt->usecoloring) { 1317 MatTransposeColoring matcoloring = abt->matcoloring; 1318 Mat Bt_dense,C_dense = abt->ABt_den; 1319 1320 /* Get Bt_dense by Apply MatTransposeColoring to B */ 1321 Bt_dense = abt->Bt_den; 1322 ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr); 1323 1324 /* C_dense = A*Bt_dense */ 1325 ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr); 1326 1327 /* Recover C from C_dense */ 1328 ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr); 1329 PetscFunctionReturn(0); 1330 } 1331 1332 for (i=0; i<cm; i++) { 1333 anzi = ai[i+1] - ai[i]; 1334 acol = aj + ai[i]; 1335 aval = aa + ai[i]; 1336 cnzi = ci[i+1] - ci[i]; 1337 ccol = cj + ci[i]; 1338 cval = ca + ci[i]; 1339 for (j=0; j<cnzi; j++) { 1340 brow = ccol[j]; 1341 bnzj = bi[brow+1] - bi[brow]; 1342 bcol = bj + bi[brow]; 1343 bval = ba + bi[brow]; 1344 1345 /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */ 1346 nexta = 0; nextb = 0; 1347 while (nexta<anzi && nextb<bnzj) { 1348 while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++; 1349 if (nexta == anzi) break; 1350 while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++; 1351 if (nextb == bnzj) break; 1352 if (acol[nexta] == bcol[nextb]) { 1353 cval[j] += aval[nexta]*bval[nextb]; 1354 nexta++; nextb++; 1355 flops += 2; 1356 } 1357 } 1358 } 1359 } 1360 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1361 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1362 ierr = PetscLogFlops(flops);CHKERRQ(ierr); 1363 PetscFunctionReturn(0); 1364 } 1365 1366 PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data) 1367 { 1368 PetscErrorCode ierr; 1369 Mat_MatTransMatMult *atb = (Mat_MatTransMatMult*)data; 1370 1371 PetscFunctionBegin; 1372 ierr = MatDestroy(&atb->At);CHKERRQ(ierr); 1373 if (atb->destroy) { 1374 ierr = (*atb->destroy)(atb->data);CHKERRQ(ierr); 1375 } 1376 ierr = PetscFree(atb);CHKERRQ(ierr); 1377 PetscFunctionReturn(0); 1378 } 1379 1380 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C) 1381 { 1382 PetscErrorCode ierr; 1383 Mat At = NULL; 1384 PetscInt *ati,*atj; 1385 Mat_Product *product = C->product; 1386 PetscBool flg,def,square; 1387 1388 PetscFunctionBegin; 1389 MatCheckProduct(C,4); 1390 square = (PetscBool)(A == B && A->symmetric && A->symmetric_set); 1391 /* outerproduct */ 1392 ierr = PetscStrcmp(product->alg,"outerproduct",&flg);CHKERRQ(ierr); 1393 if (flg) { 1394 /* create symbolic At */ 1395 if (!square) { 1396 ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr); 1397 ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr); 1398 ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr); 1399 ierr = MatSetType(At,((PetscObject)A)->type_name);CHKERRQ(ierr); 1400 } 1401 /* get symbolic C=At*B */ 1402 ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); 1403 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C);CHKERRQ(ierr); 1404 1405 /* clean up */ 1406 if (!square) { 1407 ierr = MatDestroy(&At);CHKERRQ(ierr); 1408 ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr); 1409 } 1410 1411 C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */ 1412 ierr = MatProductSetAlgorithm(C,"outerproduct");CHKERRQ(ierr); 1413 PetscFunctionReturn(0); 1414 } 1415 1416 /* matmatmult */ 1417 ierr = PetscStrcmp(product->alg,"default",&def);CHKERRQ(ierr); 1418 ierr = PetscStrcmp(product->alg,"at*b",&flg);CHKERRQ(ierr); 1419 if (flg || def) { 1420 Mat_MatTransMatMult *atb; 1421 1422 if (product->data) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty"); 1423 ierr = PetscNew(&atb);CHKERRQ(ierr); 1424 if (!square) { 1425 ierr = MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);CHKERRQ(ierr); 1426 } 1427 ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); 1428 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C);CHKERRQ(ierr); 1429 ierr = MatProductSetAlgorithm(C,"at*b");CHKERRQ(ierr); 1430 product->data = atb; 1431 product->destroy = MatDestroy_SeqAIJ_MatTransMatMult; 1432 atb->At = At; 1433 atb->updateAt = PETSC_FALSE; /* because At is computed here */ 1434 1435 C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */ 1436 PetscFunctionReturn(0); 1437 } 1438 1439 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported"); 1440 } 1441 1442 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 1443 { 1444 PetscErrorCode ierr; 1445 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data; 1446 PetscInt am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb; 1447 PetscInt cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k; 1448 PetscLogDouble flops=0.0; 1449 MatScalar *aa=a->a,*ba,*ca,*caj; 1450 1451 PetscFunctionBegin; 1452 if (!c->a) { 1453 ierr = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 1454 1455 c->a = ca; 1456 c->free_a = PETSC_TRUE; 1457 } else { 1458 ca = c->a; 1459 ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr); 1460 } 1461 1462 /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */ 1463 for (i=0; i<am; i++) { 1464 bj = b->j + bi[i]; 1465 ba = b->a + bi[i]; 1466 bnzi = bi[i+1] - bi[i]; 1467 anzi = ai[i+1] - ai[i]; 1468 for (j=0; j<anzi; j++) { 1469 nextb = 0; 1470 crow = *aj++; 1471 cjj = cj + ci[crow]; 1472 caj = ca + ci[crow]; 1473 /* perform sparse axpy operation. Note cjj includes bj. */ 1474 for (k=0; nextb<bnzi; k++) { 1475 if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */ 1476 caj[k] += (*aa)*(*(ba+nextb)); 1477 nextb++; 1478 } 1479 } 1480 flops += 2*bnzi; 1481 aa++; 1482 } 1483 } 1484 1485 /* Assemble the final matrix and clean up */ 1486 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1487 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1488 ierr = PetscLogFlops(flops);CHKERRQ(ierr); 1489 PetscFunctionReturn(0); 1490 } 1491 1492 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C) 1493 { 1494 PetscErrorCode ierr; 1495 1496 PetscFunctionBegin; 1497 ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr); 1498 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense; 1499 PetscFunctionReturn(0); 1500 } 1501 1502 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C,const PetscBool add) 1503 { 1504 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 1505 Mat_SeqDense *bd=(Mat_SeqDense*)B->data; 1506 Mat_SeqDense *cd=(Mat_SeqDense*)C->data; 1507 PetscErrorCode ierr; 1508 PetscScalar *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4; 1509 const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av; 1510 const PetscInt *aj; 1511 PetscInt cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n; 1512 PetscInt clda=cd->lda; 1513 PetscInt am4=4*clda,bm4=4*bm,col,i,j,n; 1514 1515 PetscFunctionBegin; 1516 if (!cm || !cn) PetscFunctionReturn(0); 1517 ierr = MatSeqAIJGetArrayRead(A,&av);CHKERRQ(ierr); 1518 if (add) { 1519 ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr); 1520 } else { 1521 ierr = MatDenseGetArrayWrite(C,&c);CHKERRQ(ierr); 1522 } 1523 ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr); 1524 b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm; 1525 c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda; 1526 for (col=0; col<(cn/4)*4; col += 4) { /* over columns of C */ 1527 for (i=0; i<am; i++) { /* over rows of A in those columns */ 1528 r1 = r2 = r3 = r4 = 0.0; 1529 n = a->i[i+1] - a->i[i]; 1530 aj = a->j + a->i[i]; 1531 aa = av + a->i[i]; 1532 for (j=0; j<n; j++) { 1533 const PetscScalar aatmp = aa[j]; 1534 const PetscInt ajtmp = aj[j]; 1535 r1 += aatmp*b1[ajtmp]; 1536 r2 += aatmp*b2[ajtmp]; 1537 r3 += aatmp*b3[ajtmp]; 1538 r4 += aatmp*b4[ajtmp]; 1539 } 1540 if (add) { 1541 c1[i] += r1; 1542 c2[i] += r2; 1543 c3[i] += r3; 1544 c4[i] += r4; 1545 } else { 1546 c1[i] = r1; 1547 c2[i] = r2; 1548 c3[i] = r3; 1549 c4[i] = r4; 1550 } 1551 } 1552 b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4; 1553 c1 += am4; c2 += am4; c3 += am4; c4 += am4; 1554 } 1555 /* process remaining columns */ 1556 if (col != cn) { 1557 PetscInt rc = cn-col; 1558 1559 if (rc == 1) { 1560 for (i=0; i<am; i++) { 1561 r1 = 0.0; 1562 n = a->i[i+1] - a->i[i]; 1563 aj = a->j + a->i[i]; 1564 aa = av + a->i[i]; 1565 for (j=0; j<n; j++) r1 += aa[j]*b1[aj[j]]; 1566 if (add) c1[i] += r1; 1567 else c1[i] = r1; 1568 } 1569 } else if (rc == 2) { 1570 for (i=0; i<am; i++) { 1571 r1 = r2 = 0.0; 1572 n = a->i[i+1] - a->i[i]; 1573 aj = a->j + a->i[i]; 1574 aa = av + a->i[i]; 1575 for (j=0; j<n; j++) { 1576 const PetscScalar aatmp = aa[j]; 1577 const PetscInt ajtmp = aj[j]; 1578 r1 += aatmp*b1[ajtmp]; 1579 r2 += aatmp*b2[ajtmp]; 1580 } 1581 if (add) { 1582 c1[i] += r1; 1583 c2[i] += r2; 1584 } else { 1585 c1[i] = r1; 1586 c2[i] = r2; 1587 } 1588 } 1589 } else { 1590 for (i=0; i<am; i++) { 1591 r1 = r2 = r3 = 0.0; 1592 n = a->i[i+1] - a->i[i]; 1593 aj = a->j + a->i[i]; 1594 aa = av + a->i[i]; 1595 for (j=0; j<n; j++) { 1596 const PetscScalar aatmp = aa[j]; 1597 const PetscInt ajtmp = aj[j]; 1598 r1 += aatmp*b1[ajtmp]; 1599 r2 += aatmp*b2[ajtmp]; 1600 r3 += aatmp*b3[ajtmp]; 1601 } 1602 if (add) { 1603 c1[i] += r1; 1604 c2[i] += r2; 1605 c3[i] += r3; 1606 } else { 1607 c1[i] = r1; 1608 c2[i] = r2; 1609 c3[i] = r3; 1610 } 1611 } 1612 } 1613 } 1614 ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr); 1615 if (add) { 1616 ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr); 1617 } else { 1618 ierr = MatDenseRestoreArrayWrite(C,&c);CHKERRQ(ierr); 1619 } 1620 ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr); 1621 ierr = MatSeqAIJRestoreArrayRead(A,&av);CHKERRQ(ierr); 1622 PetscFunctionReturn(0); 1623 } 1624 1625 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C) 1626 { 1627 PetscErrorCode ierr; 1628 1629 PetscFunctionBegin; 1630 if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n); 1631 if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n); 1632 if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n); 1633 1634 ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C,PETSC_FALSE);CHKERRQ(ierr); 1635 PetscFunctionReturn(0); 1636 } 1637 1638 /* ------------------------------------------------------- */ 1639 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C) 1640 { 1641 PetscFunctionBegin; 1642 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense; 1643 C->ops->productsymbolic = MatProductSymbolic_AB; 1644 PetscFunctionReturn(0); 1645 } 1646 1647 PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat,Mat,PetscReal,Mat); 1648 1649 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C) 1650 { 1651 PetscFunctionBegin; 1652 C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1653 C->ops->productsymbolic = MatProductSymbolic_AtB; 1654 PetscFunctionReturn(0); 1655 } 1656 1657 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C) 1658 { 1659 PetscFunctionBegin; 1660 C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1661 C->ops->productsymbolic = MatProductSymbolic_ABt; 1662 PetscFunctionReturn(0); 1663 } 1664 1665 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C) 1666 { 1667 PetscErrorCode ierr; 1668 Mat_Product *product = C->product; 1669 1670 PetscFunctionBegin; 1671 switch (product->type) { 1672 case MATPRODUCT_AB: 1673 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C);CHKERRQ(ierr); 1674 break; 1675 case MATPRODUCT_AtB: 1676 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C);CHKERRQ(ierr); 1677 break; 1678 case MATPRODUCT_ABt: 1679 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C);CHKERRQ(ierr); 1680 break; 1681 default: 1682 break; 1683 } 1684 PetscFunctionReturn(0); 1685 } 1686 /* ------------------------------------------------------- */ 1687 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C) 1688 { 1689 PetscErrorCode ierr; 1690 Mat_Product *product = C->product; 1691 Mat A = product->A; 1692 PetscBool baij; 1693 1694 PetscFunctionBegin; 1695 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij);CHKERRQ(ierr); 1696 if (!baij) { /* A is seqsbaij */ 1697 PetscBool sbaij; 1698 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij);CHKERRQ(ierr); 1699 if (!sbaij) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format"); 1700 1701 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense; 1702 } else { /* A is seqbaij */ 1703 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense; 1704 } 1705 1706 C->ops->productsymbolic = MatProductSymbolic_AB; 1707 PetscFunctionReturn(0); 1708 } 1709 1710 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C) 1711 { 1712 PetscErrorCode ierr; 1713 Mat_Product *product = C->product; 1714 1715 PetscFunctionBegin; 1716 MatCheckProduct(C,1); 1717 if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A"); 1718 if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric)) { 1719 ierr = MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C);CHKERRQ(ierr); 1720 } 1721 PetscFunctionReturn(0); 1722 } 1723 1724 /* ------------------------------------------------------- */ 1725 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C) 1726 { 1727 PetscFunctionBegin; 1728 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ; 1729 C->ops->productsymbolic = MatProductSymbolic_AB; 1730 PetscFunctionReturn(0); 1731 } 1732 1733 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C) 1734 { 1735 PetscErrorCode ierr; 1736 Mat_Product *product = C->product; 1737 1738 PetscFunctionBegin; 1739 if (product->type == MATPRODUCT_AB) { 1740 ierr = MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C);CHKERRQ(ierr); 1741 } 1742 PetscFunctionReturn(0); 1743 } 1744 /* ------------------------------------------------------- */ 1745 1746 PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense) 1747 { 1748 PetscErrorCode ierr; 1749 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 1750 Mat_SeqDense *btdense = (Mat_SeqDense*)Btdense->data; 1751 PetscInt *bi = b->i,*bj=b->j; 1752 PetscInt m = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns; 1753 MatScalar *btval,*btval_den,*ba=b->a; 1754 PetscInt *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors; 1755 1756 PetscFunctionBegin; 1757 btval_den=btdense->v; 1758 ierr = PetscArrayzero(btval_den,m*n);CHKERRQ(ierr); 1759 for (k=0; k<ncolors; k++) { 1760 ncolumns = coloring->ncolumns[k]; 1761 for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */ 1762 col = *(columns + colorforcol[k] + l); 1763 btcol = bj + bi[col]; 1764 btval = ba + bi[col]; 1765 anz = bi[col+1] - bi[col]; 1766 for (j=0; j<anz; j++) { 1767 brow = btcol[j]; 1768 btval_den[brow] = btval[j]; 1769 } 1770 } 1771 btval_den += m; 1772 } 1773 PetscFunctionReturn(0); 1774 } 1775 1776 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp) 1777 { 1778 PetscErrorCode ierr; 1779 Mat_SeqAIJ *csp = (Mat_SeqAIJ*)Csp->data; 1780 const PetscScalar *ca_den,*ca_den_ptr; 1781 PetscScalar *ca=csp->a; 1782 PetscInt k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors; 1783 PetscInt brows=matcoloring->brows,*den2sp=matcoloring->den2sp; 1784 PetscInt nrows,*row,*idx; 1785 PetscInt *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow; 1786 1787 PetscFunctionBegin; 1788 ierr = MatDenseGetArrayRead(Cden,&ca_den);CHKERRQ(ierr); 1789 1790 if (brows > 0) { 1791 PetscInt *lstart,row_end,row_start; 1792 lstart = matcoloring->lstart; 1793 ierr = PetscArrayzero(lstart,ncolors);CHKERRQ(ierr); 1794 1795 row_end = brows; 1796 if (row_end > m) row_end = m; 1797 for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */ 1798 ca_den_ptr = ca_den; 1799 for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */ 1800 nrows = matcoloring->nrows[k]; 1801 row = rows + colorforrow[k]; 1802 idx = den2sp + colorforrow[k]; 1803 for (l=lstart[k]; l<nrows; l++) { 1804 if (row[l] >= row_end) { 1805 lstart[k] = l; 1806 break; 1807 } else { 1808 ca[idx[l]] = ca_den_ptr[row[l]]; 1809 } 1810 } 1811 ca_den_ptr += m; 1812 } 1813 row_end += brows; 1814 if (row_end > m) row_end = m; 1815 } 1816 } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */ 1817 ca_den_ptr = ca_den; 1818 for (k=0; k<ncolors; k++) { 1819 nrows = matcoloring->nrows[k]; 1820 row = rows + colorforrow[k]; 1821 idx = den2sp + colorforrow[k]; 1822 for (l=0; l<nrows; l++) { 1823 ca[idx[l]] = ca_den_ptr[row[l]]; 1824 } 1825 ca_den_ptr += m; 1826 } 1827 } 1828 1829 ierr = MatDenseRestoreArrayRead(Cden,&ca_den);CHKERRQ(ierr); 1830 #if defined(PETSC_USE_INFO) 1831 if (matcoloring->brows > 0) { 1832 ierr = PetscInfo1(Csp,"Loop over %D row blocks for den2sp\n",brows);CHKERRQ(ierr); 1833 } else { 1834 ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr); 1835 } 1836 #endif 1837 PetscFunctionReturn(0); 1838 } 1839 1840 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c) 1841 { 1842 PetscErrorCode ierr; 1843 PetscInt i,n,nrows,Nbs,j,k,m,ncols,col,cm; 1844 const PetscInt *is,*ci,*cj,*row_idx; 1845 PetscInt nis = iscoloring->n,*rowhit,bs = 1; 1846 IS *isa; 1847 Mat_SeqAIJ *csp = (Mat_SeqAIJ*)mat->data; 1848 PetscInt *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i; 1849 PetscInt *colorforcol,*columns,*columns_i,brows; 1850 PetscBool flg; 1851 1852 PetscFunctionBegin; 1853 ierr = ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa);CHKERRQ(ierr); 1854 1855 /* bs >1 is not being tested yet! */ 1856 Nbs = mat->cmap->N/bs; 1857 c->M = mat->rmap->N/bs; /* set total rows, columns and local rows */ 1858 c->N = Nbs; 1859 c->m = c->M; 1860 c->rstart = 0; 1861 c->brows = 100; 1862 1863 c->ncolors = nis; 1864 ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr); 1865 ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr); 1866 ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr); 1867 1868 brows = c->brows; 1869 ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr); 1870 if (flg) c->brows = brows; 1871 if (brows > 0) { 1872 ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr); 1873 } 1874 1875 colorforrow[0] = 0; 1876 rows_i = rows; 1877 den2sp_i = den2sp; 1878 1879 ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr); 1880 ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr); 1881 1882 colorforcol[0] = 0; 1883 columns_i = columns; 1884 1885 /* get column-wise storage of mat */ 1886 ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr); 1887 1888 cm = c->m; 1889 ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr); 1890 ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr); 1891 for (i=0; i<nis; i++) { /* loop over color */ 1892 ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr); 1893 ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr); 1894 1895 c->ncolumns[i] = n; 1896 if (n) { 1897 ierr = PetscArraycpy(columns_i,is,n);CHKERRQ(ierr); 1898 } 1899 colorforcol[i+1] = colorforcol[i] + n; 1900 columns_i += n; 1901 1902 /* fast, crude version requires O(N*N) work */ 1903 ierr = PetscArrayzero(rowhit,cm);CHKERRQ(ierr); 1904 1905 for (j=0; j<n; j++) { /* loop over columns*/ 1906 col = is[j]; 1907 row_idx = cj + ci[col]; 1908 m = ci[col+1] - ci[col]; 1909 for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */ 1910 idxhit[*row_idx] = spidx[ci[col] + k]; 1911 rowhit[*row_idx++] = col + 1; 1912 } 1913 } 1914 /* count the number of hits */ 1915 nrows = 0; 1916 for (j=0; j<cm; j++) { 1917 if (rowhit[j]) nrows++; 1918 } 1919 c->nrows[i] = nrows; 1920 colorforrow[i+1] = colorforrow[i] + nrows; 1921 1922 nrows = 0; 1923 for (j=0; j<cm; j++) { /* loop over rows */ 1924 if (rowhit[j]) { 1925 rows_i[nrows] = j; 1926 den2sp_i[nrows] = idxhit[j]; 1927 nrows++; 1928 } 1929 } 1930 den2sp_i += nrows; 1931 1932 ierr = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr); 1933 rows_i += nrows; 1934 } 1935 ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr); 1936 ierr = PetscFree(rowhit);CHKERRQ(ierr); 1937 ierr = ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa);CHKERRQ(ierr); 1938 if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]); 1939 1940 c->colorforrow = colorforrow; 1941 c->rows = rows; 1942 c->den2sp = den2sp; 1943 c->colorforcol = colorforcol; 1944 c->columns = columns; 1945 1946 ierr = PetscFree(idxhit);CHKERRQ(ierr); 1947 PetscFunctionReturn(0); 1948 } 1949 1950 /* --------------------------------------------------------------- */ 1951 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C) 1952 { 1953 PetscErrorCode ierr; 1954 Mat_Product *product = C->product; 1955 Mat A=product->A,B=product->B; 1956 1957 PetscFunctionBegin; 1958 if (C->ops->mattransposemultnumeric) { 1959 /* Alg: "outerproduct" */ 1960 ierr = (*C->ops->mattransposemultnumeric)(A,B,C);CHKERRQ(ierr); 1961 } else { 1962 /* Alg: "matmatmult" -- C = At*B */ 1963 Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data; 1964 Mat At; 1965 1966 if (!atb) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 1967 At = atb->At; 1968 if (atb->updateAt && At) { /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ() */ 1969 ierr = MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);CHKERRQ(ierr); 1970 } 1971 ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ(At ? At : A,B,C);CHKERRQ(ierr); 1972 atb->updateAt = PETSC_TRUE; 1973 } 1974 PetscFunctionReturn(0); 1975 } 1976 1977 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C) 1978 { 1979 PetscErrorCode ierr; 1980 Mat_Product *product = C->product; 1981 Mat A=product->A,B=product->B; 1982 PetscReal fill=product->fill; 1983 1984 PetscFunctionBegin; 1985 ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr); 1986 1987 C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ; 1988 PetscFunctionReturn(0); 1989 } 1990 1991 /* --------------------------------------------------------------- */ 1992 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C) 1993 { 1994 PetscErrorCode ierr; 1995 Mat_Product *product = C->product; 1996 PetscInt alg = 0; /* default algorithm */ 1997 PetscBool flg = PETSC_FALSE; 1998 #if !defined(PETSC_HAVE_HYPRE) 1999 const char *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"}; 2000 PetscInt nalg = 7; 2001 #else 2002 const char *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"}; 2003 PetscInt nalg = 8; 2004 #endif 2005 2006 PetscFunctionBegin; 2007 /* Set default algorithm */ 2008 ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr); 2009 if (flg) { 2010 ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2011 } 2012 2013 /* Get runtime option */ 2014 if (product->api_user) { 2015 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");CHKERRQ(ierr); 2016 ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 2017 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2018 } else { 2019 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr); 2020 ierr = PetscOptionsEList("-matproduct_ab_via","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 2021 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2022 } 2023 if (flg) { 2024 ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2025 } 2026 2027 C->ops->productsymbolic = MatProductSymbolic_AB; 2028 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ; 2029 PetscFunctionReturn(0); 2030 } 2031 2032 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C) 2033 { 2034 PetscErrorCode ierr; 2035 Mat_Product *product = C->product; 2036 PetscInt alg = 0; /* default algorithm */ 2037 PetscBool flg = PETSC_FALSE; 2038 const char *algTypes[3] = {"default","at*b","outerproduct"}; 2039 PetscInt nalg = 3; 2040 2041 PetscFunctionBegin; 2042 /* Get runtime option */ 2043 if (product->api_user) { 2044 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr); 2045 ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 2046 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2047 } else { 2048 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr); 2049 ierr = PetscOptionsEList("-matproduct_atb_via","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 2050 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2051 } 2052 if (flg) { 2053 ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2054 } 2055 2056 C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ; 2057 PetscFunctionReturn(0); 2058 } 2059 2060 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C) 2061 { 2062 PetscErrorCode ierr; 2063 Mat_Product *product = C->product; 2064 PetscInt alg = 0; /* default algorithm */ 2065 PetscBool flg = PETSC_FALSE; 2066 const char *algTypes[2] = {"default","color"}; 2067 PetscInt nalg = 2; 2068 2069 PetscFunctionBegin; 2070 /* Set default algorithm */ 2071 ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr); 2072 if (!flg) { 2073 alg = 1; 2074 ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2075 } 2076 2077 /* Get runtime option */ 2078 if (product->api_user) { 2079 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");CHKERRQ(ierr); 2080 ierr = PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 2081 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2082 } else { 2083 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");CHKERRQ(ierr); 2084 ierr = PetscOptionsEList("-matproduct_abt_via","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 2085 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2086 } 2087 if (flg) { 2088 ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2089 } 2090 2091 C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ; 2092 C->ops->productsymbolic = MatProductSymbolic_ABt; 2093 PetscFunctionReturn(0); 2094 } 2095 2096 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C) 2097 { 2098 PetscErrorCode ierr; 2099 Mat_Product *product = C->product; 2100 PetscBool flg = PETSC_FALSE; 2101 PetscInt alg = 0; /* default algorithm -- alg=1 should be default!!! */ 2102 #if !defined(PETSC_HAVE_HYPRE) 2103 const char *algTypes[2] = {"scalable","rap"}; 2104 PetscInt nalg = 2; 2105 #else 2106 const char *algTypes[3] = {"scalable","rap","hypre"}; 2107 PetscInt nalg = 3; 2108 #endif 2109 2110 PetscFunctionBegin; 2111 /* Set default algorithm */ 2112 ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr); 2113 if (flg) { 2114 ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2115 } 2116 2117 /* Get runtime option */ 2118 if (product->api_user) { 2119 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 2120 ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 2121 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2122 } else { 2123 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr); 2124 ierr = PetscOptionsEList("-matproduct_ptap_via","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 2125 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2126 } 2127 if (flg) { 2128 ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2129 } 2130 2131 C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ; 2132 PetscFunctionReturn(0); 2133 } 2134 2135 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C) 2136 { 2137 PetscErrorCode ierr; 2138 Mat_Product *product = C->product; 2139 PetscBool flg = PETSC_FALSE; 2140 PetscInt alg = 0; /* default algorithm */ 2141 const char *algTypes[3] = {"r*a*rt","r*art","coloring_rart"}; 2142 PetscInt nalg = 3; 2143 2144 PetscFunctionBegin; 2145 /* Set default algorithm */ 2146 ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr); 2147 if (flg) { 2148 ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2149 } 2150 2151 /* Get runtime option */ 2152 if (product->api_user) { 2153 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");CHKERRQ(ierr); 2154 ierr = PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 2155 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2156 } else { 2157 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr); 2158 ierr = PetscOptionsEList("-matproduct_rart_via","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 2159 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2160 } 2161 if (flg) { 2162 ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2163 } 2164 2165 C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ; 2166 PetscFunctionReturn(0); 2167 } 2168 2169 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */ 2170 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C) 2171 { 2172 PetscErrorCode ierr; 2173 Mat_Product *product = C->product; 2174 PetscInt alg = 0; /* default algorithm */ 2175 PetscBool flg = PETSC_FALSE; 2176 const char *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"}; 2177 PetscInt nalg = 7; 2178 2179 PetscFunctionBegin; 2180 /* Set default algorithm */ 2181 ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr); 2182 if (flg) { 2183 ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2184 } 2185 2186 /* Get runtime option */ 2187 if (product->api_user) { 2188 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr); 2189 ierr = PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 2190 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2191 } else { 2192 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr); 2193 ierr = PetscOptionsEList("-matproduct_abc_via","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 2194 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2195 } 2196 if (flg) { 2197 ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2198 } 2199 2200 C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ; 2201 C->ops->productsymbolic = MatProductSymbolic_ABC; 2202 PetscFunctionReturn(0); 2203 } 2204 2205 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C) 2206 { 2207 PetscErrorCode ierr; 2208 Mat_Product *product = C->product; 2209 2210 PetscFunctionBegin; 2211 switch (product->type) { 2212 case MATPRODUCT_AB: 2213 ierr = MatProductSetFromOptions_SeqAIJ_AB(C);CHKERRQ(ierr); 2214 break; 2215 case MATPRODUCT_AtB: 2216 ierr = MatProductSetFromOptions_SeqAIJ_AtB(C);CHKERRQ(ierr); 2217 break; 2218 case MATPRODUCT_ABt: 2219 ierr = MatProductSetFromOptions_SeqAIJ_ABt(C);CHKERRQ(ierr); 2220 break; 2221 case MATPRODUCT_PtAP: 2222 ierr = MatProductSetFromOptions_SeqAIJ_PtAP(C);CHKERRQ(ierr); 2223 break; 2224 case MATPRODUCT_RARt: 2225 ierr = MatProductSetFromOptions_SeqAIJ_RARt(C);CHKERRQ(ierr); 2226 break; 2227 case MATPRODUCT_ABC: 2228 ierr = MatProductSetFromOptions_SeqAIJ_ABC(C);CHKERRQ(ierr); 2229 break; 2230 default: 2231 break; 2232 } 2233 PetscFunctionReturn(0); 2234 } 2235