1 2 /* 3 Defines matrix-matrix product routines for pairs of SeqAIJ matrices 4 C = A * B 5 */ 6 7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8 #include <../src/mat/utils/freespace.h> 9 #include <petscbt.h> 10 #include <petsc/private/isimpl.h> 11 #include <../src/mat/impls/dense/seq/dense.h> 12 13 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 14 { 15 PetscErrorCode ierr; 16 17 PetscFunctionBegin; 18 if (C->ops->matmultnumeric) { 19 if (C->ops->matmultnumeric == MatMatMultNumeric_SeqAIJ_SeqAIJ) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Recursive call"); 20 ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr); 21 } else { 22 ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C);CHKERRQ(ierr); 23 } 24 PetscFunctionReturn(0); 25 } 26 27 /* Modified from MatCreateSeqAIJWithArrays() */ 28 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat) 29 { 30 PetscErrorCode ierr; 31 PetscInt ii; 32 Mat_SeqAIJ *aij; 33 PetscBool isseqaij; 34 35 PetscFunctionBegin; 36 if (m > 0 && i[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0"); 37 ierr = MatSetSizes(mat,m,n,m,n);CHKERRQ(ierr); 38 39 if (!mtype) { 40 ierr = PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij);CHKERRQ(ierr); 41 if (!isseqaij) { ierr = MatSetType(mat,MATSEQAIJ);CHKERRQ(ierr); } 42 } else { 43 ierr = MatSetType(mat,mtype);CHKERRQ(ierr); 44 } 45 ierr = MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 46 aij = (Mat_SeqAIJ*)(mat)->data; 47 ierr = PetscMalloc1(m,&aij->imax);CHKERRQ(ierr); 48 ierr = PetscMalloc1(m,&aij->ilen);CHKERRQ(ierr); 49 50 aij->i = i; 51 aij->j = j; 52 aij->a = a; 53 aij->singlemalloc = PETSC_FALSE; 54 aij->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 55 aij->free_a = PETSC_FALSE; 56 aij->free_ij = PETSC_FALSE; 57 58 for (ii=0; ii<m; ii++) { 59 aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii]; 60 } 61 62 PetscFunctionReturn(0); 63 } 64 65 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C) 66 { 67 PetscErrorCode ierr; 68 Mat_Product *product = C->product; 69 MatProductAlgorithm alg; 70 PetscBool flg; 71 72 PetscFunctionBegin; 73 if (product) { 74 alg = product->alg; 75 } else { 76 alg = "sorted"; 77 } 78 /* sorted */ 79 ierr = PetscStrcmp(alg,"sorted",&flg);CHKERRQ(ierr); 80 if (flg) { 81 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C);CHKERRQ(ierr); 82 PetscFunctionReturn(0); 83 } 84 85 /* scalable */ 86 ierr = PetscStrcmp(alg,"scalable",&flg);CHKERRQ(ierr); 87 if (flg) { 88 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr); 89 PetscFunctionReturn(0); 90 } 91 92 /* scalable_fast */ 93 ierr = PetscStrcmp(alg,"scalable_fast",&flg);CHKERRQ(ierr); 94 if (flg) { 95 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr); 96 PetscFunctionReturn(0); 97 } 98 99 /* heap */ 100 ierr = PetscStrcmp(alg,"heap",&flg);CHKERRQ(ierr); 101 if (flg) { 102 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr); 103 PetscFunctionReturn(0); 104 } 105 106 /* btheap */ 107 ierr = PetscStrcmp(alg,"btheap",&flg);CHKERRQ(ierr); 108 if (flg) { 109 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr); 110 PetscFunctionReturn(0); 111 } 112 113 /* llcondensed */ 114 ierr = PetscStrcmp(alg,"llcondensed",&flg);CHKERRQ(ierr); 115 if (flg) { 116 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr); 117 PetscFunctionReturn(0); 118 } 119 120 /* rowmerge */ 121 ierr = PetscStrcmp(alg,"rowmerge",&flg);CHKERRQ(ierr); 122 if (flg) { 123 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);CHKERRQ(ierr); 124 PetscFunctionReturn(0); 125 } 126 127 #if defined(PETSC_HAVE_HYPRE) 128 ierr = PetscStrcmp(alg,"hypre",&flg);CHKERRQ(ierr); 129 if (flg) { 130 ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr); 131 PetscFunctionReturn(0); 132 } 133 #endif 134 135 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported"); 136 } 137 138 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C) 139 { 140 PetscErrorCode ierr; 141 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 142 PetscInt *ai=a->i,*bi=b->i,*ci,*cj; 143 PetscInt am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 144 PetscReal afill; 145 PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 146 PetscTable ta; 147 PetscBT lnkbt; 148 PetscFreeSpaceList free_space=NULL,current_space=NULL; 149 150 PetscFunctionBegin; 151 /* Get ci and cj */ 152 /*---------------*/ 153 /* Allocate ci array, arrays for fill computation and */ 154 /* free space for accumulating nonzero column info */ 155 ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 156 ci[0] = 0; 157 158 /* create and initialize a linked list */ 159 ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 160 MatRowMergeMax_SeqAIJ(b,bm,ta); 161 ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 162 ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 163 164 ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr); 165 166 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 167 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 168 169 current_space = free_space; 170 171 /* Determine ci and cj */ 172 for (i=0; i<am; i++) { 173 anzi = ai[i+1] - ai[i]; 174 aj = a->j + ai[i]; 175 for (j=0; j<anzi; j++) { 176 brow = aj[j]; 177 bnzj = bi[brow+1] - bi[brow]; 178 bj = b->j + bi[brow]; 179 /* add non-zero cols of B into the sorted linked list lnk */ 180 ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr); 181 } 182 /* add possible missing diagonal entry */ 183 if (C->force_diagonals) { 184 ierr = PetscLLCondensedAddSorted(1,&i,lnk,lnkbt);CHKERRQ(ierr); 185 } 186 cnzi = lnk[0]; 187 188 /* If free space is not available, make more free space */ 189 /* Double the amount of total space in the list */ 190 if (current_space->local_remaining<cnzi) { 191 ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 192 ndouble++; 193 } 194 195 /* Copy data into free space, then initialize lnk */ 196 ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 197 198 current_space->array += cnzi; 199 current_space->local_used += cnzi; 200 current_space->local_remaining -= cnzi; 201 202 ci[i+1] = ci[i] + cnzi; 203 } 204 205 /* Column indices are in the list of free space */ 206 /* Allocate space for cj, initialize cj, and */ 207 /* destroy list of free space and other temporary array(s) */ 208 ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 209 ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 210 ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr); 211 212 /* put together the new symbolic matrix */ 213 ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 214 ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 215 216 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 217 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 218 c = (Mat_SeqAIJ*)(C->data); 219 c->free_a = PETSC_FALSE; 220 c->free_ij = PETSC_TRUE; 221 c->nonew = 0; 222 223 /* fast, needs non-scalable O(bn) array 'abdense' */ 224 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 225 226 /* set MatInfo */ 227 afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 228 if (afill < 1.0) afill = 1.0; 229 c->maxnz = ci[am]; 230 c->nz = ci[am]; 231 C->info.mallocs = ndouble; 232 C->info.fill_ratio_given = fill; 233 C->info.fill_ratio_needed = afill; 234 235 #if defined(PETSC_USE_INFO) 236 if (ci[am]) { 237 ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 238 ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 239 } else { 240 ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 241 } 242 #endif 243 PetscFunctionReturn(0); 244 } 245 246 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C) 247 { 248 PetscErrorCode ierr; 249 PetscLogDouble flops=0.0; 250 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 251 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 252 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 253 PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j; 254 PetscInt am =A->rmap->n,cm=C->rmap->n; 255 PetscInt i,j,k,anzi,bnzi,cnzi,brow; 256 PetscScalar *ca,valtmp; 257 PetscScalar *ab_dense; 258 PetscContainer cab_dense; 259 const PetscScalar *aa,*ba,*baj; 260 261 PetscFunctionBegin; 262 ierr = MatSeqAIJGetArrayRead(A,&aa);CHKERRQ(ierr); 263 ierr = MatSeqAIJGetArrayRead(B,&ba);CHKERRQ(ierr); 264 if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 265 ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 266 c->a = ca; 267 c->free_a = PETSC_TRUE; 268 } else ca = c->a; 269 270 /* TODO this should be done in the symbolic phase */ 271 /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers 272 that is hard to eradicate) */ 273 ierr = PetscObjectQuery((PetscObject)C,"__PETSc__ab_dense",(PetscObject*)&cab_dense);CHKERRQ(ierr); 274 if (!cab_dense) { 275 ierr = PetscMalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr); 276 ierr = PetscContainerCreate(PETSC_COMM_SELF,&cab_dense);CHKERRQ(ierr); 277 ierr = PetscContainerSetPointer(cab_dense,ab_dense);CHKERRQ(ierr); 278 ierr = PetscContainerSetUserDestroy(cab_dense,PetscContainerUserDestroyDefault);CHKERRQ(ierr); 279 ierr = PetscObjectCompose((PetscObject)C,"__PETSc__ab_dense",(PetscObject)cab_dense);CHKERRQ(ierr); 280 ierr = PetscObjectDereference((PetscObject)cab_dense);CHKERRQ(ierr); 281 } 282 ierr = PetscContainerGetPointer(cab_dense,(void**)&ab_dense);CHKERRQ(ierr); 283 ierr = PetscArrayzero(ab_dense,B->cmap->N);CHKERRQ(ierr); 284 285 /* clean old values in C */ 286 ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr); 287 /* Traverse A row-wise. */ 288 /* Build the ith row in C by summing over nonzero columns in A, */ 289 /* the rows of B corresponding to nonzeros of A. */ 290 for (i=0; i<am; i++) { 291 anzi = ai[i+1] - ai[i]; 292 for (j=0; j<anzi; j++) { 293 brow = aj[j]; 294 bnzi = bi[brow+1] - bi[brow]; 295 bjj = bj + bi[brow]; 296 baj = ba + bi[brow]; 297 /* perform dense axpy */ 298 valtmp = aa[j]; 299 for (k=0; k<bnzi; k++) { 300 ab_dense[bjj[k]] += valtmp*baj[k]; 301 } 302 flops += 2*bnzi; 303 } 304 aj += anzi; aa += anzi; 305 306 cnzi = ci[i+1] - ci[i]; 307 for (k=0; k<cnzi; k++) { 308 ca[k] += ab_dense[cj[k]]; 309 ab_dense[cj[k]] = 0.0; /* zero ab_dense */ 310 } 311 flops += cnzi; 312 cj += cnzi; ca += cnzi; 313 } 314 #if defined(PETSC_HAVE_DEVICE) 315 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 316 #endif 317 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 318 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 319 ierr = PetscLogFlops(flops);CHKERRQ(ierr); 320 ierr = MatSeqAIJRestoreArrayRead(A,&aa);CHKERRQ(ierr); 321 ierr = MatSeqAIJRestoreArrayRead(B,&ba);CHKERRQ(ierr); 322 PetscFunctionReturn(0); 323 } 324 325 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C) 326 { 327 PetscErrorCode ierr; 328 PetscLogDouble flops=0.0; 329 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 330 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 331 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 332 PetscInt *ai = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j; 333 PetscInt am = A->rmap->N,cm=C->rmap->N; 334 PetscInt i,j,k,anzi,bnzi,cnzi,brow; 335 PetscScalar *ca=c->a,valtmp; 336 const PetscScalar *aa,*ba,*baj; 337 PetscInt nextb; 338 339 PetscFunctionBegin; 340 ierr = MatSeqAIJGetArrayRead(A,&aa);CHKERRQ(ierr); 341 ierr = MatSeqAIJGetArrayRead(B,&ba);CHKERRQ(ierr); 342 if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 343 ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 344 c->a = ca; 345 c->free_a = PETSC_TRUE; 346 } 347 348 /* clean old values in C */ 349 ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr); 350 /* Traverse A row-wise. */ 351 /* Build the ith row in C by summing over nonzero columns in A, */ 352 /* the rows of B corresponding to nonzeros of A. */ 353 for (i=0; i<am; i++) { 354 anzi = ai[i+1] - ai[i]; 355 cnzi = ci[i+1] - ci[i]; 356 for (j=0; j<anzi; j++) { 357 brow = aj[j]; 358 bnzi = bi[brow+1] - bi[brow]; 359 bjj = bj + bi[brow]; 360 baj = ba + bi[brow]; 361 /* perform sparse axpy */ 362 valtmp = aa[j]; 363 nextb = 0; 364 for (k=0; nextb<bnzi; k++) { 365 if (cj[k] == bjj[nextb]) { /* ccol == bcol */ 366 ca[k] += valtmp*baj[nextb++]; 367 } 368 } 369 flops += 2*bnzi; 370 } 371 aj += anzi; aa += anzi; 372 cj += cnzi; ca += cnzi; 373 } 374 #if defined(PETSC_HAVE_DEVICE) 375 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 376 #endif 377 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 378 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 379 ierr = PetscLogFlops(flops);CHKERRQ(ierr); 380 ierr = MatSeqAIJRestoreArrayRead(A,&aa);CHKERRQ(ierr); 381 ierr = MatSeqAIJRestoreArrayRead(B,&ba);CHKERRQ(ierr); 382 PetscFunctionReturn(0); 383 } 384 385 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C) 386 { 387 PetscErrorCode ierr; 388 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 389 PetscInt *ai = a->i,*bi=b->i,*ci,*cj; 390 PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 391 MatScalar *ca; 392 PetscReal afill; 393 PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 394 PetscTable ta; 395 PetscFreeSpaceList free_space=NULL,current_space=NULL; 396 397 PetscFunctionBegin; 398 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */ 399 /*-----------------------------------------------------------------------------------------*/ 400 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 401 ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 402 ci[0] = 0; 403 404 /* create and initialize a linked list */ 405 ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 406 MatRowMergeMax_SeqAIJ(b,bm,ta); 407 ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 408 ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 409 410 ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr); 411 412 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 413 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 414 current_space = free_space; 415 416 /* Determine ci and cj */ 417 for (i=0; i<am; i++) { 418 anzi = ai[i+1] - ai[i]; 419 aj = a->j + ai[i]; 420 for (j=0; j<anzi; j++) { 421 brow = aj[j]; 422 bnzj = bi[brow+1] - bi[brow]; 423 bj = b->j + bi[brow]; 424 /* add non-zero cols of B into the sorted linked list lnk */ 425 ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr); 426 } 427 /* add possible missing diagonal entry */ 428 if (C->force_diagonals) { 429 ierr = PetscLLCondensedAddSorted_fast(1,&i,lnk);CHKERRQ(ierr); 430 } 431 cnzi = lnk[1]; 432 433 /* If free space is not available, make more free space */ 434 /* Double the amount of total space in the list */ 435 if (current_space->local_remaining<cnzi) { 436 ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 437 ndouble++; 438 } 439 440 /* Copy data into free space, then initialize lnk */ 441 ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr); 442 443 current_space->array += cnzi; 444 current_space->local_used += cnzi; 445 current_space->local_remaining -= cnzi; 446 447 ci[i+1] = ci[i] + cnzi; 448 } 449 450 /* Column indices are in the list of free space */ 451 /* Allocate space for cj, initialize cj, and */ 452 /* destroy list of free space and other temporary array(s) */ 453 ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 454 ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 455 ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr); 456 457 /* Allocate space for ca */ 458 ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr); 459 460 /* put together the new symbolic matrix */ 461 ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 462 ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 463 464 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 465 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 466 c = (Mat_SeqAIJ*)(C->data); 467 c->free_a = PETSC_TRUE; 468 c->free_ij = PETSC_TRUE; 469 c->nonew = 0; 470 471 /* slower, less memory */ 472 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 473 474 /* set MatInfo */ 475 afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 476 if (afill < 1.0) afill = 1.0; 477 c->maxnz = ci[am]; 478 c->nz = ci[am]; 479 C->info.mallocs = ndouble; 480 C->info.fill_ratio_given = fill; 481 C->info.fill_ratio_needed = afill; 482 483 #if defined(PETSC_USE_INFO) 484 if (ci[am]) { 485 ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 486 ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 487 } else { 488 ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 489 } 490 #endif 491 PetscFunctionReturn(0); 492 } 493 494 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C) 495 { 496 PetscErrorCode ierr; 497 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 498 PetscInt *ai = a->i,*bi=b->i,*ci,*cj; 499 PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 500 MatScalar *ca; 501 PetscReal afill; 502 PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 503 PetscTable ta; 504 PetscFreeSpaceList free_space=NULL,current_space=NULL; 505 506 PetscFunctionBegin; 507 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */ 508 /*---------------------------------------------------------------------------------------------*/ 509 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 510 ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 511 ci[0] = 0; 512 513 /* create and initialize a linked list */ 514 ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 515 MatRowMergeMax_SeqAIJ(b,bm,ta); 516 ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 517 ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 518 ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr); 519 520 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 521 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 522 current_space = free_space; 523 524 /* Determine ci and cj */ 525 for (i=0; i<am; i++) { 526 anzi = ai[i+1] - ai[i]; 527 aj = a->j + ai[i]; 528 for (j=0; j<anzi; j++) { 529 brow = aj[j]; 530 bnzj = bi[brow+1] - bi[brow]; 531 bj = b->j + bi[brow]; 532 /* add non-zero cols of B into the sorted linked list lnk */ 533 ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr); 534 } 535 /* add possible missing diagonal entry */ 536 if (C->force_diagonals) { 537 ierr = PetscLLCondensedAddSorted_Scalable(1,&i,lnk);CHKERRQ(ierr); 538 } 539 540 cnzi = lnk[0]; 541 542 /* If free space is not available, make more free space */ 543 /* Double the amount of total space in the list */ 544 if (current_space->local_remaining<cnzi) { 545 ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 546 ndouble++; 547 } 548 549 /* Copy data into free space, then initialize lnk */ 550 ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr); 551 552 current_space->array += cnzi; 553 current_space->local_used += cnzi; 554 current_space->local_remaining -= cnzi; 555 556 ci[i+1] = ci[i] + cnzi; 557 } 558 559 /* Column indices are in the list of free space */ 560 /* Allocate space for cj, initialize cj, and */ 561 /* destroy list of free space and other temporary array(s) */ 562 ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 563 ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 564 ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); 565 566 /* Allocate space for ca */ 567 /*-----------------------*/ 568 ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr); 569 570 /* put together the new symbolic matrix */ 571 ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 572 ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 573 574 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 575 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 576 c = (Mat_SeqAIJ*)(C->data); 577 c->free_a = PETSC_TRUE; 578 c->free_ij = PETSC_TRUE; 579 c->nonew = 0; 580 581 /* slower, less memory */ 582 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 583 584 /* set MatInfo */ 585 afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 586 if (afill < 1.0) afill = 1.0; 587 c->maxnz = ci[am]; 588 c->nz = ci[am]; 589 C->info.mallocs = ndouble; 590 C->info.fill_ratio_given = fill; 591 C->info.fill_ratio_needed = afill; 592 593 #if defined(PETSC_USE_INFO) 594 if (ci[am]) { 595 ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 596 ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 597 } else { 598 ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 599 } 600 #endif 601 PetscFunctionReturn(0); 602 } 603 604 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C) 605 { 606 PetscErrorCode ierr; 607 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 608 const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j; 609 PetscInt *ci,*cj,*bb; 610 PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 611 PetscReal afill; 612 PetscInt i,j,col,ndouble = 0; 613 PetscFreeSpaceList free_space=NULL,current_space=NULL; 614 PetscHeap h; 615 616 PetscFunctionBegin; 617 /* Get ci and cj - by merging sorted rows using a heap */ 618 /*---------------------------------------------------------------------------------------------*/ 619 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 620 ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 621 ci[0] = 0; 622 623 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 624 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 625 current_space = free_space; 626 627 ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr); 628 ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr); 629 630 /* Determine ci and cj */ 631 for (i=0; i<am; i++) { 632 const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 633 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 634 ci[i+1] = ci[i]; 635 /* Populate the min heap */ 636 for (j=0; j<anzi; j++) { 637 bb[j] = bi[acol[j]]; /* bb points at the start of the row */ 638 if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */ 639 ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr); 640 } 641 } 642 /* Pick off the min element, adding it to free space */ 643 ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 644 while (j >= 0) { 645 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 646 ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space);CHKERRQ(ierr); 647 ndouble++; 648 } 649 *(current_space->array++) = col; 650 current_space->local_used++; 651 current_space->local_remaining--; 652 ci[i+1]++; 653 654 /* stash if anything else remains in this row of B */ 655 if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);} 656 while (1) { /* pop and stash any other rows of B that also had an entry in this column */ 657 PetscInt j2,col2; 658 ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr); 659 if (col2 != col) break; 660 ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr); 661 if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);} 662 } 663 /* Put any stashed elements back into the min heap */ 664 ierr = PetscHeapUnstash(h);CHKERRQ(ierr); 665 ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 666 } 667 } 668 ierr = PetscFree(bb);CHKERRQ(ierr); 669 ierr = PetscHeapDestroy(&h);CHKERRQ(ierr); 670 671 /* Column indices are in the list of free space */ 672 /* Allocate space for cj, initialize cj, and */ 673 /* destroy list of free space and other temporary array(s) */ 674 ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr); 675 ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 676 677 /* put together the new symbolic matrix */ 678 ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 679 ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 680 681 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 682 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 683 c = (Mat_SeqAIJ*)(C->data); 684 c->free_a = PETSC_TRUE; 685 c->free_ij = PETSC_TRUE; 686 c->nonew = 0; 687 688 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 689 690 /* set MatInfo */ 691 afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 692 if (afill < 1.0) afill = 1.0; 693 c->maxnz = ci[am]; 694 c->nz = ci[am]; 695 C->info.mallocs = ndouble; 696 C->info.fill_ratio_given = fill; 697 C->info.fill_ratio_needed = afill; 698 699 #if defined(PETSC_USE_INFO) 700 if (ci[am]) { 701 ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 702 ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 703 } else { 704 ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 705 } 706 #endif 707 PetscFunctionReturn(0); 708 } 709 710 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C) 711 { 712 PetscErrorCode ierr; 713 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 714 const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j; 715 PetscInt *ci,*cj,*bb; 716 PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 717 PetscReal afill; 718 PetscInt i,j,col,ndouble = 0; 719 PetscFreeSpaceList free_space=NULL,current_space=NULL; 720 PetscHeap h; 721 PetscBT bt; 722 723 PetscFunctionBegin; 724 /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */ 725 /*---------------------------------------------------------------------------------------------*/ 726 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 727 ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 728 ci[0] = 0; 729 730 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 731 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 732 733 current_space = free_space; 734 735 ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr); 736 ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr); 737 ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr); 738 739 /* Determine ci and cj */ 740 for (i=0; i<am; i++) { 741 const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 742 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 743 const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */ 744 ci[i+1] = ci[i]; 745 /* Populate the min heap */ 746 for (j=0; j<anzi; j++) { 747 PetscInt brow = acol[j]; 748 for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) { 749 PetscInt bcol = bj[bb[j]]; 750 if (!PetscBTLookupSet(bt,bcol)) { /* new entry */ 751 ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr); 752 bb[j]++; 753 break; 754 } 755 } 756 } 757 /* Pick off the min element, adding it to free space */ 758 ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 759 while (j >= 0) { 760 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 761 fptr = NULL; /* need PetscBTMemzero */ 762 ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space);CHKERRQ(ierr); 763 ndouble++; 764 } 765 *(current_space->array++) = col; 766 current_space->local_used++; 767 current_space->local_remaining--; 768 ci[i+1]++; 769 770 /* stash if anything else remains in this row of B */ 771 for (; bb[j] < bi[acol[j]+1]; bb[j]++) { 772 PetscInt bcol = bj[bb[j]]; 773 if (!PetscBTLookupSet(bt,bcol)) { /* new entry */ 774 ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr); 775 bb[j]++; 776 break; 777 } 778 } 779 ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 780 } 781 if (fptr) { /* Clear the bits for this row */ 782 for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);} 783 } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */ 784 ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr); 785 } 786 } 787 ierr = PetscFree(bb);CHKERRQ(ierr); 788 ierr = PetscHeapDestroy(&h);CHKERRQ(ierr); 789 ierr = PetscBTDestroy(&bt);CHKERRQ(ierr); 790 791 /* Column indices are in the list of free space */ 792 /* Allocate space for cj, initialize cj, and */ 793 /* destroy list of free space and other temporary array(s) */ 794 ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr); 795 ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 796 797 /* put together the new symbolic matrix */ 798 ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 799 ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 800 801 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 802 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 803 c = (Mat_SeqAIJ*)(C->data); 804 c->free_a = PETSC_TRUE; 805 c->free_ij = PETSC_TRUE; 806 c->nonew = 0; 807 808 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 809 810 /* set MatInfo */ 811 afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 812 if (afill < 1.0) afill = 1.0; 813 c->maxnz = ci[am]; 814 c->nz = ci[am]; 815 C->info.mallocs = ndouble; 816 C->info.fill_ratio_given = fill; 817 C->info.fill_ratio_needed = afill; 818 819 #if defined(PETSC_USE_INFO) 820 if (ci[am]) { 821 ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 822 ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 823 } else { 824 ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 825 } 826 #endif 827 PetscFunctionReturn(0); 828 } 829 830 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C) 831 { 832 PetscErrorCode ierr; 833 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 834 const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1; 835 PetscInt *ci,*cj,*outputj,worki_L1[9],worki_L2[9]; 836 PetscInt c_maxmem,a_maxrownnz=0,a_rownnz; 837 const PetscInt workcol[8]={0,1,2,3,4,5,6,7}; 838 const PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 839 const PetscInt *brow_ptr[8],*brow_end[8]; 840 PetscInt window[8]; 841 PetscInt window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows; 842 PetscInt i,k,ndouble=0,L1_rowsleft,rowsleft; 843 PetscReal afill; 844 PetscInt *workj_L1,*workj_L2,*workj_L3; 845 PetscInt L1_nnz,L2_nnz; 846 847 /* Step 1: Get upper bound on memory required for allocation. 848 Because of the way virtual memory works, 849 only the memory pages that are actually needed will be physically allocated. */ 850 PetscFunctionBegin; 851 ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr); 852 for (i=0; i<am; i++) { 853 const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 854 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 855 a_rownnz = 0; 856 for (k=0; k<anzi; ++k) { 857 a_rownnz += bi[acol[k]+1] - bi[acol[k]]; 858 if (a_rownnz > bn) { 859 a_rownnz = bn; 860 break; 861 } 862 } 863 a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 864 } 865 /* temporary work areas for merging rows */ 866 ierr = PetscMalloc1(a_maxrownnz*8,&workj_L1);CHKERRQ(ierr); 867 ierr = PetscMalloc1(a_maxrownnz*8,&workj_L2);CHKERRQ(ierr); 868 ierr = PetscMalloc1(a_maxrownnz,&workj_L3);CHKERRQ(ierr); 869 870 /* This should be enough for almost all matrices. If not, memory is reallocated later. */ 871 c_maxmem = 8*(ai[am]+bi[bm]); 872 /* Step 2: Populate pattern for C */ 873 ierr = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr); 874 875 ci_nnz = 0; 876 ci[0] = 0; 877 worki_L1[0] = 0; 878 worki_L2[0] = 0; 879 for (i=0; i<am; i++) { 880 const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 881 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 882 rowsleft = anzi; 883 inputcol_L1 = acol; 884 L2_nnz = 0; 885 L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */ 886 worki_L2[1] = 0; 887 outputi_nnz = 0; 888 889 /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */ 890 while (ci_nnz+a_maxrownnz > c_maxmem) { 891 c_maxmem *= 2; 892 ndouble++; 893 ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr); 894 } 895 896 while (rowsleft) { 897 L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */ 898 L1_nrows = 0; 899 L1_nnz = 0; 900 inputcol = inputcol_L1; 901 inputi = bi; 902 inputj = bj; 903 904 /* The following macro is used to specialize for small rows in A. 905 This helps with compiler unrolling, improving performance substantially. 906 Input: inputj inputi inputcol bn 907 Output: outputj outputi_nnz */ 908 #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \ 909 window_min = bn; \ 910 outputi_nnz = 0; \ 911 for (k=0; k<ANNZ; ++k) { \ 912 brow_ptr[k] = inputj + inputi[inputcol[k]]; \ 913 brow_end[k] = inputj + inputi[inputcol[k]+1]; \ 914 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 915 window_min = PetscMin(window[k], window_min); \ 916 } \ 917 while (window_min < bn) { \ 918 outputj[outputi_nnz++] = window_min; \ 919 /* advance front and compute new minimum */ \ 920 old_window_min = window_min; \ 921 window_min = bn; \ 922 for (k=0; k<ANNZ; ++k) { \ 923 if (window[k] == old_window_min) { \ 924 brow_ptr[k]++; \ 925 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 926 } \ 927 window_min = PetscMin(window[k], window_min); \ 928 } \ 929 } 930 931 /************** L E V E L 1 ***************/ 932 /* Merge up to 8 rows of B to L1 work array*/ 933 while (L1_rowsleft) { 934 outputi_nnz = 0; 935 if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/ 936 else outputj = cj + ci_nnz; /* Merge directly to C */ 937 938 switch (L1_rowsleft) { 939 case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 940 brow_end[0] = inputj + inputi[inputcol[0]+1]; 941 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 942 inputcol += L1_rowsleft; 943 rowsleft -= L1_rowsleft; 944 L1_rowsleft = 0; 945 break; 946 case 2: MatMatMultSymbolic_RowMergeMacro(2); 947 inputcol += L1_rowsleft; 948 rowsleft -= L1_rowsleft; 949 L1_rowsleft = 0; 950 break; 951 case 3: MatMatMultSymbolic_RowMergeMacro(3); 952 inputcol += L1_rowsleft; 953 rowsleft -= L1_rowsleft; 954 L1_rowsleft = 0; 955 break; 956 case 4: MatMatMultSymbolic_RowMergeMacro(4); 957 inputcol += L1_rowsleft; 958 rowsleft -= L1_rowsleft; 959 L1_rowsleft = 0; 960 break; 961 case 5: MatMatMultSymbolic_RowMergeMacro(5); 962 inputcol += L1_rowsleft; 963 rowsleft -= L1_rowsleft; 964 L1_rowsleft = 0; 965 break; 966 case 6: MatMatMultSymbolic_RowMergeMacro(6); 967 inputcol += L1_rowsleft; 968 rowsleft -= L1_rowsleft; 969 L1_rowsleft = 0; 970 break; 971 case 7: MatMatMultSymbolic_RowMergeMacro(7); 972 inputcol += L1_rowsleft; 973 rowsleft -= L1_rowsleft; 974 L1_rowsleft = 0; 975 break; 976 default: MatMatMultSymbolic_RowMergeMacro(8); 977 inputcol += 8; 978 rowsleft -= 8; 979 L1_rowsleft -= 8; 980 break; 981 } 982 inputcol_L1 = inputcol; 983 L1_nnz += outputi_nnz; 984 worki_L1[++L1_nrows] = L1_nnz; 985 } 986 987 /********************** L E V E L 2 ************************/ 988 /* Merge from L1 work array to either C or to L2 work array */ 989 if (anzi > 8) { 990 inputi = worki_L1; 991 inputj = workj_L1; 992 inputcol = workcol; 993 outputi_nnz = 0; 994 995 if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */ 996 else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */ 997 998 switch (L1_nrows) { 999 case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 1000 brow_end[0] = inputj + inputi[inputcol[0]+1]; 1001 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1002 break; 1003 case 2: MatMatMultSymbolic_RowMergeMacro(2); break; 1004 case 3: MatMatMultSymbolic_RowMergeMacro(3); break; 1005 case 4: MatMatMultSymbolic_RowMergeMacro(4); break; 1006 case 5: MatMatMultSymbolic_RowMergeMacro(5); break; 1007 case 6: MatMatMultSymbolic_RowMergeMacro(6); break; 1008 case 7: MatMatMultSymbolic_RowMergeMacro(7); break; 1009 case 8: MatMatMultSymbolic_RowMergeMacro(8); break; 1010 default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!"); 1011 } 1012 L2_nnz += outputi_nnz; 1013 worki_L2[++L2_nrows] = L2_nnz; 1014 1015 /************************ L E V E L 3 **********************/ 1016 /* Merge from L2 work array to either C or to L2 work array */ 1017 if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) { 1018 inputi = worki_L2; 1019 inputj = workj_L2; 1020 inputcol = workcol; 1021 outputi_nnz = 0; 1022 if (rowsleft) outputj = workj_L3; 1023 else outputj = cj + ci_nnz; 1024 switch (L2_nrows) { 1025 case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 1026 brow_end[0] = inputj + inputi[inputcol[0]+1]; 1027 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1028 break; 1029 case 2: MatMatMultSymbolic_RowMergeMacro(2); break; 1030 case 3: MatMatMultSymbolic_RowMergeMacro(3); break; 1031 case 4: MatMatMultSymbolic_RowMergeMacro(4); break; 1032 case 5: MatMatMultSymbolic_RowMergeMacro(5); break; 1033 case 6: MatMatMultSymbolic_RowMergeMacro(6); break; 1034 case 7: MatMatMultSymbolic_RowMergeMacro(7); break; 1035 case 8: MatMatMultSymbolic_RowMergeMacro(8); break; 1036 default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!"); 1037 } 1038 L2_nrows = 1; 1039 L2_nnz = outputi_nnz; 1040 worki_L2[1] = outputi_nnz; 1041 /* Copy to workj_L2 */ 1042 if (rowsleft) { 1043 for (k=0; k<outputi_nnz; ++k) workj_L2[k] = outputj[k]; 1044 } 1045 } 1046 } 1047 } /* while (rowsleft) */ 1048 #undef MatMatMultSymbolic_RowMergeMacro 1049 1050 /* terminate current row */ 1051 ci_nnz += outputi_nnz; 1052 ci[i+1] = ci_nnz; 1053 } 1054 1055 /* Step 3: Create the new symbolic matrix */ 1056 ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 1057 ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 1058 1059 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1060 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1061 c = (Mat_SeqAIJ*)(C->data); 1062 c->free_a = PETSC_TRUE; 1063 c->free_ij = PETSC_TRUE; 1064 c->nonew = 0; 1065 1066 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1067 1068 /* set MatInfo */ 1069 afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 1070 if (afill < 1.0) afill = 1.0; 1071 c->maxnz = ci[am]; 1072 c->nz = ci[am]; 1073 C->info.mallocs = ndouble; 1074 C->info.fill_ratio_given = fill; 1075 C->info.fill_ratio_needed = afill; 1076 1077 #if defined(PETSC_USE_INFO) 1078 if (ci[am]) { 1079 ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 1080 ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 1081 } else { 1082 ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 1083 } 1084 #endif 1085 1086 /* Step 4: Free temporary work areas */ 1087 ierr = PetscFree(workj_L1);CHKERRQ(ierr); 1088 ierr = PetscFree(workj_L2);CHKERRQ(ierr); 1089 ierr = PetscFree(workj_L3);CHKERRQ(ierr); 1090 PetscFunctionReturn(0); 1091 } 1092 1093 /* concatenate unique entries and then sort */ 1094 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C) 1095 { 1096 PetscErrorCode ierr; 1097 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 1098 const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j; 1099 PetscInt *ci,*cj,bcol; 1100 PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 1101 PetscReal afill; 1102 PetscInt i,j,ndouble = 0; 1103 PetscSegBuffer seg,segrow; 1104 char *seen; 1105 1106 PetscFunctionBegin; 1107 ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr); 1108 ci[0] = 0; 1109 1110 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 1111 ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr); 1112 ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr); 1113 ierr = PetscCalloc1(bn,&seen);CHKERRQ(ierr); 1114 1115 /* Determine ci and cj */ 1116 for (i=0; i<am; i++) { 1117 const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1118 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 1119 PetscInt packlen = 0,*PETSC_RESTRICT crow; 1120 1121 /* Pack segrow */ 1122 for (j=0; j<anzi; j++) { 1123 PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k; 1124 for (k=bjstart; k<bjend; k++) { 1125 bcol = bj[k]; 1126 if (!seen[bcol]) { /* new entry */ 1127 PetscInt *PETSC_RESTRICT slot; 1128 ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr); 1129 *slot = bcol; 1130 seen[bcol] = 1; 1131 packlen++; 1132 } 1133 } 1134 } 1135 1136 /* Check i-th diagonal entry */ 1137 if (C->force_diagonals && !seen[i]) { 1138 PetscInt *PETSC_RESTRICT slot; 1139 ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr); 1140 *slot = i; 1141 seen[i] = 1; 1142 packlen++; 1143 } 1144 1145 ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr); 1146 ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr); 1147 ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr); 1148 ci[i+1] = ci[i] + packlen; 1149 for (j=0; j<packlen; j++) seen[crow[j]] = 0; 1150 } 1151 ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr); 1152 ierr = PetscFree(seen);CHKERRQ(ierr); 1153 1154 /* Column indices are in the segmented buffer */ 1155 ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr); 1156 ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr); 1157 1158 /* put together the new symbolic matrix */ 1159 ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 1160 ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 1161 1162 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1163 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1164 c = (Mat_SeqAIJ*)(C->data); 1165 c->free_a = PETSC_TRUE; 1166 c->free_ij = PETSC_TRUE; 1167 c->nonew = 0; 1168 1169 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1170 1171 /* set MatInfo */ 1172 afill = (PetscReal)ci[am]/PetscMax(ai[am]+bi[bm],1) + 1.e-5; 1173 if (afill < 1.0) afill = 1.0; 1174 c->maxnz = ci[am]; 1175 c->nz = ci[am]; 1176 C->info.mallocs = ndouble; 1177 C->info.fill_ratio_given = fill; 1178 C->info.fill_ratio_needed = afill; 1179 1180 #if defined(PETSC_USE_INFO) 1181 if (ci[am]) { 1182 ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 1183 ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 1184 } else { 1185 ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 1186 } 1187 #endif 1188 PetscFunctionReturn(0); 1189 } 1190 1191 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data) 1192 { 1193 PetscErrorCode ierr; 1194 Mat_MatMatTransMult *abt=(Mat_MatMatTransMult *)data; 1195 1196 PetscFunctionBegin; 1197 ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr); 1198 ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr); 1199 ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr); 1200 ierr = PetscFree(abt);CHKERRQ(ierr); 1201 PetscFunctionReturn(0); 1202 } 1203 1204 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C) 1205 { 1206 PetscErrorCode ierr; 1207 Mat Bt; 1208 PetscInt *bti,*btj; 1209 Mat_MatMatTransMult *abt; 1210 Mat_Product *product = C->product; 1211 char *alg; 1212 1213 PetscFunctionBegin; 1214 if (!product) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 1215 if (product->data) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty"); 1216 1217 /* create symbolic Bt */ 1218 ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr); 1219 ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr); 1220 ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr); 1221 ierr = MatSetType(Bt,((PetscObject)A)->type_name);CHKERRQ(ierr); 1222 1223 /* get symbolic C=A*Bt */ 1224 ierr = PetscStrallocpy(product->alg,&alg);CHKERRQ(ierr); 1225 ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); /* set algorithm for C = A*Bt */ 1226 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr); 1227 ierr = MatProductSetAlgorithm(C,alg);CHKERRQ(ierr); /* resume original algorithm for ABt product */ 1228 ierr = PetscFree(alg);CHKERRQ(ierr); 1229 1230 /* create a supporting struct for reuse intermediate dense matrices with matcoloring */ 1231 ierr = PetscNew(&abt);CHKERRQ(ierr); 1232 1233 product->data = abt; 1234 product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans; 1235 1236 C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ; 1237 1238 abt->usecoloring = PETSC_FALSE; 1239 ierr = PetscStrcmp(product->alg,"color",&abt->usecoloring);CHKERRQ(ierr); 1240 if (abt->usecoloring) { 1241 /* Create MatTransposeColoring from symbolic C=A*B^T */ 1242 MatTransposeColoring matcoloring; 1243 MatColoring coloring; 1244 ISColoring iscoloring; 1245 Mat Bt_dense,C_dense; 1246 1247 /* inode causes memory problem */ 1248 ierr = MatSetOption(C,MAT_USE_INODES,PETSC_FALSE);CHKERRQ(ierr); 1249 1250 ierr = MatColoringCreate(C,&coloring);CHKERRQ(ierr); 1251 ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr); 1252 ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr); 1253 ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr); 1254 ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr); 1255 ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr); 1256 ierr = MatTransposeColoringCreate(C,iscoloring,&matcoloring);CHKERRQ(ierr); 1257 1258 abt->matcoloring = matcoloring; 1259 1260 ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr); 1261 1262 /* Create Bt_dense and C_dense = A*Bt_dense */ 1263 ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr); 1264 ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr); 1265 ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr); 1266 ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr); 1267 1268 Bt_dense->assembled = PETSC_TRUE; 1269 abt->Bt_den = Bt_dense; 1270 1271 ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr); 1272 ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr); 1273 ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr); 1274 ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr); 1275 1276 Bt_dense->assembled = PETSC_TRUE; 1277 abt->ABt_den = C_dense; 1278 1279 #if defined(PETSC_USE_INFO) 1280 { 1281 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 1282 ierr = PetscInfo7(C,"Use coloring of C=A*B^T; B^T: %D %D, Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr); 1283 } 1284 #endif 1285 } 1286 /* clean up */ 1287 ierr = MatDestroy(&Bt);CHKERRQ(ierr); 1288 ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr); 1289 PetscFunctionReturn(0); 1290 } 1291 1292 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 1293 { 1294 PetscErrorCode ierr; 1295 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data; 1296 PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow; 1297 PetscInt cm =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol; 1298 PetscLogDouble flops=0.0; 1299 MatScalar *aa =a->a,*aval,*ba=b->a,*bval,*ca,*cval; 1300 Mat_MatMatTransMult *abt; 1301 Mat_Product *product = C->product; 1302 1303 PetscFunctionBegin; 1304 if (!product) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 1305 abt = (Mat_MatMatTransMult *)product->data; 1306 if (!abt) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 1307 /* clear old values in C */ 1308 if (!c->a) { 1309 ierr = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 1310 c->a = ca; 1311 c->free_a = PETSC_TRUE; 1312 } else { 1313 ca = c->a; 1314 ierr = PetscArrayzero(ca,ci[cm]+1);CHKERRQ(ierr); 1315 } 1316 1317 if (abt->usecoloring) { 1318 MatTransposeColoring matcoloring = abt->matcoloring; 1319 Mat Bt_dense,C_dense = abt->ABt_den; 1320 1321 /* Get Bt_dense by Apply MatTransposeColoring to B */ 1322 Bt_dense = abt->Bt_den; 1323 ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr); 1324 1325 /* C_dense = A*Bt_dense */ 1326 ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr); 1327 1328 /* Recover C from C_dense */ 1329 ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr); 1330 PetscFunctionReturn(0); 1331 } 1332 1333 for (i=0; i<cm; i++) { 1334 anzi = ai[i+1] - ai[i]; 1335 acol = aj + ai[i]; 1336 aval = aa + ai[i]; 1337 cnzi = ci[i+1] - ci[i]; 1338 ccol = cj + ci[i]; 1339 cval = ca + ci[i]; 1340 for (j=0; j<cnzi; j++) { 1341 brow = ccol[j]; 1342 bnzj = bi[brow+1] - bi[brow]; 1343 bcol = bj + bi[brow]; 1344 bval = ba + bi[brow]; 1345 1346 /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */ 1347 nexta = 0; nextb = 0; 1348 while (nexta<anzi && nextb<bnzj) { 1349 while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++; 1350 if (nexta == anzi) break; 1351 while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++; 1352 if (nextb == bnzj) break; 1353 if (acol[nexta] == bcol[nextb]) { 1354 cval[j] += aval[nexta]*bval[nextb]; 1355 nexta++; nextb++; 1356 flops += 2; 1357 } 1358 } 1359 } 1360 } 1361 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1362 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1363 ierr = PetscLogFlops(flops);CHKERRQ(ierr); 1364 PetscFunctionReturn(0); 1365 } 1366 1367 PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data) 1368 { 1369 PetscErrorCode ierr; 1370 Mat_MatTransMatMult *atb = (Mat_MatTransMatMult*)data; 1371 1372 PetscFunctionBegin; 1373 ierr = MatDestroy(&atb->At);CHKERRQ(ierr); 1374 if (atb->destroy) { 1375 ierr = (*atb->destroy)(atb->data);CHKERRQ(ierr); 1376 } 1377 ierr = PetscFree(atb);CHKERRQ(ierr); 1378 PetscFunctionReturn(0); 1379 } 1380 1381 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C) 1382 { 1383 PetscErrorCode ierr; 1384 Mat At = NULL; 1385 PetscInt *ati,*atj; 1386 Mat_Product *product = C->product; 1387 PetscBool flg,def,square; 1388 1389 PetscFunctionBegin; 1390 MatCheckProduct(C,4); 1391 square = (PetscBool)(A == B && A->symmetric && A->symmetric_set); 1392 /* outerproduct */ 1393 ierr = PetscStrcmp(product->alg,"outerproduct",&flg);CHKERRQ(ierr); 1394 if (flg) { 1395 /* create symbolic At */ 1396 if (!square) { 1397 ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr); 1398 ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr); 1399 ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr); 1400 ierr = MatSetType(At,((PetscObject)A)->type_name);CHKERRQ(ierr); 1401 } 1402 /* get symbolic C=At*B */ 1403 ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); 1404 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C);CHKERRQ(ierr); 1405 1406 /* clean up */ 1407 if (!square) { 1408 ierr = MatDestroy(&At);CHKERRQ(ierr); 1409 ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr); 1410 } 1411 1412 C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */ 1413 ierr = MatProductSetAlgorithm(C,"outerproduct");CHKERRQ(ierr); 1414 PetscFunctionReturn(0); 1415 } 1416 1417 /* matmatmult */ 1418 ierr = PetscStrcmp(product->alg,"default",&def);CHKERRQ(ierr); 1419 ierr = PetscStrcmp(product->alg,"at*b",&flg);CHKERRQ(ierr); 1420 if (flg || def) { 1421 Mat_MatTransMatMult *atb; 1422 1423 if (product->data) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty"); 1424 ierr = PetscNew(&atb);CHKERRQ(ierr); 1425 if (!square) { 1426 ierr = MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);CHKERRQ(ierr); 1427 } 1428 ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); 1429 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C);CHKERRQ(ierr); 1430 ierr = MatProductSetAlgorithm(C,"at*b");CHKERRQ(ierr); 1431 product->data = atb; 1432 product->destroy = MatDestroy_SeqAIJ_MatTransMatMult; 1433 atb->At = At; 1434 atb->updateAt = PETSC_FALSE; /* because At is computed here */ 1435 1436 C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */ 1437 PetscFunctionReturn(0); 1438 } 1439 1440 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported"); 1441 } 1442 1443 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 1444 { 1445 PetscErrorCode ierr; 1446 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data; 1447 PetscInt am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb; 1448 PetscInt cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k; 1449 PetscLogDouble flops=0.0; 1450 MatScalar *aa=a->a,*ba,*ca,*caj; 1451 1452 PetscFunctionBegin; 1453 if (!c->a) { 1454 ierr = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 1455 1456 c->a = ca; 1457 c->free_a = PETSC_TRUE; 1458 } else { 1459 ca = c->a; 1460 ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr); 1461 } 1462 1463 /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */ 1464 for (i=0; i<am; i++) { 1465 bj = b->j + bi[i]; 1466 ba = b->a + bi[i]; 1467 bnzi = bi[i+1] - bi[i]; 1468 anzi = ai[i+1] - ai[i]; 1469 for (j=0; j<anzi; j++) { 1470 nextb = 0; 1471 crow = *aj++; 1472 cjj = cj + ci[crow]; 1473 caj = ca + ci[crow]; 1474 /* perform sparse axpy operation. Note cjj includes bj. */ 1475 for (k=0; nextb<bnzi; k++) { 1476 if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */ 1477 caj[k] += (*aa)*(*(ba+nextb)); 1478 nextb++; 1479 } 1480 } 1481 flops += 2*bnzi; 1482 aa++; 1483 } 1484 } 1485 1486 /* Assemble the final matrix and clean up */ 1487 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1488 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1489 ierr = PetscLogFlops(flops);CHKERRQ(ierr); 1490 PetscFunctionReturn(0); 1491 } 1492 1493 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C) 1494 { 1495 PetscErrorCode ierr; 1496 1497 PetscFunctionBegin; 1498 ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr); 1499 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense; 1500 PetscFunctionReturn(0); 1501 } 1502 1503 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C,const PetscBool add) 1504 { 1505 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 1506 Mat_SeqDense *bd=(Mat_SeqDense*)B->data; 1507 Mat_SeqDense *cd=(Mat_SeqDense*)C->data; 1508 PetscErrorCode ierr; 1509 PetscScalar *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4; 1510 const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av; 1511 const PetscInt *aj; 1512 PetscInt cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n; 1513 PetscInt clda=cd->lda; 1514 PetscInt am4=4*clda,bm4=4*bm,col,i,j,n; 1515 1516 PetscFunctionBegin; 1517 if (!cm || !cn) PetscFunctionReturn(0); 1518 ierr = MatSeqAIJGetArrayRead(A,&av);CHKERRQ(ierr); 1519 if (add) { 1520 ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr); 1521 } else { 1522 ierr = MatDenseGetArrayWrite(C,&c);CHKERRQ(ierr); 1523 } 1524 ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr); 1525 b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm; 1526 c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda; 1527 for (col=0; col<(cn/4)*4; col += 4) { /* over columns of C */ 1528 for (i=0; i<am; i++) { /* over rows of A in those columns */ 1529 r1 = r2 = r3 = r4 = 0.0; 1530 n = a->i[i+1] - a->i[i]; 1531 aj = a->j + a->i[i]; 1532 aa = av + a->i[i]; 1533 for (j=0; j<n; j++) { 1534 const PetscScalar aatmp = aa[j]; 1535 const PetscInt ajtmp = aj[j]; 1536 r1 += aatmp*b1[ajtmp]; 1537 r2 += aatmp*b2[ajtmp]; 1538 r3 += aatmp*b3[ajtmp]; 1539 r4 += aatmp*b4[ajtmp]; 1540 } 1541 if (add) { 1542 c1[i] += r1; 1543 c2[i] += r2; 1544 c3[i] += r3; 1545 c4[i] += r4; 1546 } else { 1547 c1[i] = r1; 1548 c2[i] = r2; 1549 c3[i] = r3; 1550 c4[i] = r4; 1551 } 1552 } 1553 b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4; 1554 c1 += am4; c2 += am4; c3 += am4; c4 += am4; 1555 } 1556 /* process remaining columns */ 1557 if (col != cn) { 1558 PetscInt rc = cn-col; 1559 1560 if (rc == 1) { 1561 for (i=0; i<am; i++) { 1562 r1 = 0.0; 1563 n = a->i[i+1] - a->i[i]; 1564 aj = a->j + a->i[i]; 1565 aa = av + a->i[i]; 1566 for (j=0; j<n; j++) r1 += aa[j]*b1[aj[j]]; 1567 if (add) c1[i] += r1; 1568 else c1[i] = r1; 1569 } 1570 } else if (rc == 2) { 1571 for (i=0; i<am; i++) { 1572 r1 = r2 = 0.0; 1573 n = a->i[i+1] - a->i[i]; 1574 aj = a->j + a->i[i]; 1575 aa = av + a->i[i]; 1576 for (j=0; j<n; j++) { 1577 const PetscScalar aatmp = aa[j]; 1578 const PetscInt ajtmp = aj[j]; 1579 r1 += aatmp*b1[ajtmp]; 1580 r2 += aatmp*b2[ajtmp]; 1581 } 1582 if (add) { 1583 c1[i] += r1; 1584 c2[i] += r2; 1585 } else { 1586 c1[i] = r1; 1587 c2[i] = r2; 1588 } 1589 } 1590 } else { 1591 for (i=0; i<am; i++) { 1592 r1 = r2 = r3 = 0.0; 1593 n = a->i[i+1] - a->i[i]; 1594 aj = a->j + a->i[i]; 1595 aa = av + a->i[i]; 1596 for (j=0; j<n; j++) { 1597 const PetscScalar aatmp = aa[j]; 1598 const PetscInt ajtmp = aj[j]; 1599 r1 += aatmp*b1[ajtmp]; 1600 r2 += aatmp*b2[ajtmp]; 1601 r3 += aatmp*b3[ajtmp]; 1602 } 1603 if (add) { 1604 c1[i] += r1; 1605 c2[i] += r2; 1606 c3[i] += r3; 1607 } else { 1608 c1[i] = r1; 1609 c2[i] = r2; 1610 c3[i] = r3; 1611 } 1612 } 1613 } 1614 } 1615 ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr); 1616 if (add) { 1617 ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr); 1618 } else { 1619 ierr = MatDenseRestoreArrayWrite(C,&c);CHKERRQ(ierr); 1620 } 1621 ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr); 1622 ierr = MatSeqAIJRestoreArrayRead(A,&av);CHKERRQ(ierr); 1623 PetscFunctionReturn(0); 1624 } 1625 1626 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C) 1627 { 1628 PetscErrorCode ierr; 1629 1630 PetscFunctionBegin; 1631 if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n); 1632 if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n); 1633 if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n); 1634 1635 ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C,PETSC_FALSE);CHKERRQ(ierr); 1636 PetscFunctionReturn(0); 1637 } 1638 1639 /* ------------------------------------------------------- */ 1640 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C) 1641 { 1642 PetscFunctionBegin; 1643 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense; 1644 C->ops->productsymbolic = MatProductSymbolic_AB; 1645 PetscFunctionReturn(0); 1646 } 1647 1648 PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat,Mat,PetscReal,Mat); 1649 1650 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C) 1651 { 1652 PetscFunctionBegin; 1653 C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1654 C->ops->productsymbolic = MatProductSymbolic_AtB; 1655 PetscFunctionReturn(0); 1656 } 1657 1658 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C) 1659 { 1660 PetscFunctionBegin; 1661 C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1662 C->ops->productsymbolic = MatProductSymbolic_ABt; 1663 PetscFunctionReturn(0); 1664 } 1665 1666 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C) 1667 { 1668 PetscErrorCode ierr; 1669 Mat_Product *product = C->product; 1670 1671 PetscFunctionBegin; 1672 switch (product->type) { 1673 case MATPRODUCT_AB: 1674 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C);CHKERRQ(ierr); 1675 break; 1676 case MATPRODUCT_AtB: 1677 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C);CHKERRQ(ierr); 1678 break; 1679 case MATPRODUCT_ABt: 1680 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C);CHKERRQ(ierr); 1681 break; 1682 default: 1683 break; 1684 } 1685 PetscFunctionReturn(0); 1686 } 1687 /* ------------------------------------------------------- */ 1688 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C) 1689 { 1690 PetscErrorCode ierr; 1691 Mat_Product *product = C->product; 1692 Mat A = product->A; 1693 PetscBool baij; 1694 1695 PetscFunctionBegin; 1696 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij);CHKERRQ(ierr); 1697 if (!baij) { /* A is seqsbaij */ 1698 PetscBool sbaij; 1699 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij);CHKERRQ(ierr); 1700 if (!sbaij) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format"); 1701 1702 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense; 1703 } else { /* A is seqbaij */ 1704 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense; 1705 } 1706 1707 C->ops->productsymbolic = MatProductSymbolic_AB; 1708 PetscFunctionReturn(0); 1709 } 1710 1711 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C) 1712 { 1713 PetscErrorCode ierr; 1714 Mat_Product *product = C->product; 1715 1716 PetscFunctionBegin; 1717 MatCheckProduct(C,1); 1718 if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A"); 1719 if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric)) { 1720 ierr = MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C);CHKERRQ(ierr); 1721 } 1722 PetscFunctionReturn(0); 1723 } 1724 1725 /* ------------------------------------------------------- */ 1726 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C) 1727 { 1728 PetscFunctionBegin; 1729 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ; 1730 C->ops->productsymbolic = MatProductSymbolic_AB; 1731 PetscFunctionReturn(0); 1732 } 1733 1734 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C) 1735 { 1736 PetscErrorCode ierr; 1737 Mat_Product *product = C->product; 1738 1739 PetscFunctionBegin; 1740 if (product->type == MATPRODUCT_AB) { 1741 ierr = MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C);CHKERRQ(ierr); 1742 } 1743 PetscFunctionReturn(0); 1744 } 1745 /* ------------------------------------------------------- */ 1746 1747 PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense) 1748 { 1749 PetscErrorCode ierr; 1750 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 1751 Mat_SeqDense *btdense = (Mat_SeqDense*)Btdense->data; 1752 PetscInt *bi = b->i,*bj=b->j; 1753 PetscInt m = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns; 1754 MatScalar *btval,*btval_den,*ba=b->a; 1755 PetscInt *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors; 1756 1757 PetscFunctionBegin; 1758 btval_den=btdense->v; 1759 ierr = PetscArrayzero(btval_den,m*n);CHKERRQ(ierr); 1760 for (k=0; k<ncolors; k++) { 1761 ncolumns = coloring->ncolumns[k]; 1762 for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */ 1763 col = *(columns + colorforcol[k] + l); 1764 btcol = bj + bi[col]; 1765 btval = ba + bi[col]; 1766 anz = bi[col+1] - bi[col]; 1767 for (j=0; j<anz; j++) { 1768 brow = btcol[j]; 1769 btval_den[brow] = btval[j]; 1770 } 1771 } 1772 btval_den += m; 1773 } 1774 PetscFunctionReturn(0); 1775 } 1776 1777 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp) 1778 { 1779 PetscErrorCode ierr; 1780 Mat_SeqAIJ *csp = (Mat_SeqAIJ*)Csp->data; 1781 const PetscScalar *ca_den,*ca_den_ptr; 1782 PetscScalar *ca=csp->a; 1783 PetscInt k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors; 1784 PetscInt brows=matcoloring->brows,*den2sp=matcoloring->den2sp; 1785 PetscInt nrows,*row,*idx; 1786 PetscInt *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow; 1787 1788 PetscFunctionBegin; 1789 ierr = MatDenseGetArrayRead(Cden,&ca_den);CHKERRQ(ierr); 1790 1791 if (brows > 0) { 1792 PetscInt *lstart,row_end,row_start; 1793 lstart = matcoloring->lstart; 1794 ierr = PetscArrayzero(lstart,ncolors);CHKERRQ(ierr); 1795 1796 row_end = brows; 1797 if (row_end > m) row_end = m; 1798 for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */ 1799 ca_den_ptr = ca_den; 1800 for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */ 1801 nrows = matcoloring->nrows[k]; 1802 row = rows + colorforrow[k]; 1803 idx = den2sp + colorforrow[k]; 1804 for (l=lstart[k]; l<nrows; l++) { 1805 if (row[l] >= row_end) { 1806 lstart[k] = l; 1807 break; 1808 } else { 1809 ca[idx[l]] = ca_den_ptr[row[l]]; 1810 } 1811 } 1812 ca_den_ptr += m; 1813 } 1814 row_end += brows; 1815 if (row_end > m) row_end = m; 1816 } 1817 } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */ 1818 ca_den_ptr = ca_den; 1819 for (k=0; k<ncolors; k++) { 1820 nrows = matcoloring->nrows[k]; 1821 row = rows + colorforrow[k]; 1822 idx = den2sp + colorforrow[k]; 1823 for (l=0; l<nrows; l++) { 1824 ca[idx[l]] = ca_den_ptr[row[l]]; 1825 } 1826 ca_den_ptr += m; 1827 } 1828 } 1829 1830 ierr = MatDenseRestoreArrayRead(Cden,&ca_den);CHKERRQ(ierr); 1831 #if defined(PETSC_USE_INFO) 1832 if (matcoloring->brows > 0) { 1833 ierr = PetscInfo1(Csp,"Loop over %D row blocks for den2sp\n",brows);CHKERRQ(ierr); 1834 } else { 1835 ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr); 1836 } 1837 #endif 1838 PetscFunctionReturn(0); 1839 } 1840 1841 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c) 1842 { 1843 PetscErrorCode ierr; 1844 PetscInt i,n,nrows,Nbs,j,k,m,ncols,col,cm; 1845 const PetscInt *is,*ci,*cj,*row_idx; 1846 PetscInt nis = iscoloring->n,*rowhit,bs = 1; 1847 IS *isa; 1848 Mat_SeqAIJ *csp = (Mat_SeqAIJ*)mat->data; 1849 PetscInt *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i; 1850 PetscInt *colorforcol,*columns,*columns_i,brows; 1851 PetscBool flg; 1852 1853 PetscFunctionBegin; 1854 ierr = ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa);CHKERRQ(ierr); 1855 1856 /* bs >1 is not being tested yet! */ 1857 Nbs = mat->cmap->N/bs; 1858 c->M = mat->rmap->N/bs; /* set total rows, columns and local rows */ 1859 c->N = Nbs; 1860 c->m = c->M; 1861 c->rstart = 0; 1862 c->brows = 100; 1863 1864 c->ncolors = nis; 1865 ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr); 1866 ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr); 1867 ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr); 1868 1869 brows = c->brows; 1870 ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr); 1871 if (flg) c->brows = brows; 1872 if (brows > 0) { 1873 ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr); 1874 } 1875 1876 colorforrow[0] = 0; 1877 rows_i = rows; 1878 den2sp_i = den2sp; 1879 1880 ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr); 1881 ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr); 1882 1883 colorforcol[0] = 0; 1884 columns_i = columns; 1885 1886 /* get column-wise storage of mat */ 1887 ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr); 1888 1889 cm = c->m; 1890 ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr); 1891 ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr); 1892 for (i=0; i<nis; i++) { /* loop over color */ 1893 ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr); 1894 ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr); 1895 1896 c->ncolumns[i] = n; 1897 if (n) { 1898 ierr = PetscArraycpy(columns_i,is,n);CHKERRQ(ierr); 1899 } 1900 colorforcol[i+1] = colorforcol[i] + n; 1901 columns_i += n; 1902 1903 /* fast, crude version requires O(N*N) work */ 1904 ierr = PetscArrayzero(rowhit,cm);CHKERRQ(ierr); 1905 1906 for (j=0; j<n; j++) { /* loop over columns*/ 1907 col = is[j]; 1908 row_idx = cj + ci[col]; 1909 m = ci[col+1] - ci[col]; 1910 for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */ 1911 idxhit[*row_idx] = spidx[ci[col] + k]; 1912 rowhit[*row_idx++] = col + 1; 1913 } 1914 } 1915 /* count the number of hits */ 1916 nrows = 0; 1917 for (j=0; j<cm; j++) { 1918 if (rowhit[j]) nrows++; 1919 } 1920 c->nrows[i] = nrows; 1921 colorforrow[i+1] = colorforrow[i] + nrows; 1922 1923 nrows = 0; 1924 for (j=0; j<cm; j++) { /* loop over rows */ 1925 if (rowhit[j]) { 1926 rows_i[nrows] = j; 1927 den2sp_i[nrows] = idxhit[j]; 1928 nrows++; 1929 } 1930 } 1931 den2sp_i += nrows; 1932 1933 ierr = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr); 1934 rows_i += nrows; 1935 } 1936 ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr); 1937 ierr = PetscFree(rowhit);CHKERRQ(ierr); 1938 ierr = ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa);CHKERRQ(ierr); 1939 if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]); 1940 1941 c->colorforrow = colorforrow; 1942 c->rows = rows; 1943 c->den2sp = den2sp; 1944 c->colorforcol = colorforcol; 1945 c->columns = columns; 1946 1947 ierr = PetscFree(idxhit);CHKERRQ(ierr); 1948 PetscFunctionReturn(0); 1949 } 1950 1951 /* --------------------------------------------------------------- */ 1952 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C) 1953 { 1954 PetscErrorCode ierr; 1955 Mat_Product *product = C->product; 1956 Mat A=product->A,B=product->B; 1957 1958 PetscFunctionBegin; 1959 if (C->ops->mattransposemultnumeric) { 1960 /* Alg: "outerproduct" */ 1961 ierr = (*C->ops->mattransposemultnumeric)(A,B,C);CHKERRQ(ierr); 1962 } else { 1963 /* Alg: "matmatmult" -- C = At*B */ 1964 Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data; 1965 Mat At; 1966 1967 if (!atb) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 1968 At = atb->At; 1969 if (atb->updateAt && At) { /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ() */ 1970 ierr = MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);CHKERRQ(ierr); 1971 } 1972 ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ(At ? At : A,B,C);CHKERRQ(ierr); 1973 atb->updateAt = PETSC_TRUE; 1974 } 1975 PetscFunctionReturn(0); 1976 } 1977 1978 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C) 1979 { 1980 PetscErrorCode ierr; 1981 Mat_Product *product = C->product; 1982 Mat A=product->A,B=product->B; 1983 PetscReal fill=product->fill; 1984 1985 PetscFunctionBegin; 1986 ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr); 1987 1988 C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ; 1989 PetscFunctionReturn(0); 1990 } 1991 1992 /* --------------------------------------------------------------- */ 1993 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C) 1994 { 1995 PetscErrorCode ierr; 1996 Mat_Product *product = C->product; 1997 PetscInt alg = 0; /* default algorithm */ 1998 PetscBool flg = PETSC_FALSE; 1999 #if !defined(PETSC_HAVE_HYPRE) 2000 const char *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"}; 2001 PetscInt nalg = 7; 2002 #else 2003 const char *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"}; 2004 PetscInt nalg = 8; 2005 #endif 2006 2007 PetscFunctionBegin; 2008 /* Set default algorithm */ 2009 ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr); 2010 if (flg) { 2011 ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2012 } 2013 2014 /* Get runtime option */ 2015 if (product->api_user) { 2016 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");CHKERRQ(ierr); 2017 ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 2018 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2019 } else { 2020 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr); 2021 ierr = PetscOptionsEList("-matproduct_ab_via","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 2022 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2023 } 2024 if (flg) { 2025 ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2026 } 2027 2028 C->ops->productsymbolic = MatProductSymbolic_AB; 2029 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ; 2030 PetscFunctionReturn(0); 2031 } 2032 2033 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C) 2034 { 2035 PetscErrorCode ierr; 2036 Mat_Product *product = C->product; 2037 PetscInt alg = 0; /* default algorithm */ 2038 PetscBool flg = PETSC_FALSE; 2039 const char *algTypes[3] = {"default","at*b","outerproduct"}; 2040 PetscInt nalg = 3; 2041 2042 PetscFunctionBegin; 2043 /* Get runtime option */ 2044 if (product->api_user) { 2045 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr); 2046 ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 2047 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2048 } else { 2049 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr); 2050 ierr = PetscOptionsEList("-matproduct_atb_via","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 2051 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2052 } 2053 if (flg) { 2054 ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2055 } 2056 2057 C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ; 2058 PetscFunctionReturn(0); 2059 } 2060 2061 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C) 2062 { 2063 PetscErrorCode ierr; 2064 Mat_Product *product = C->product; 2065 PetscInt alg = 0; /* default algorithm */ 2066 PetscBool flg = PETSC_FALSE; 2067 const char *algTypes[2] = {"default","color"}; 2068 PetscInt nalg = 2; 2069 2070 PetscFunctionBegin; 2071 /* Set default algorithm */ 2072 ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr); 2073 if (!flg) { 2074 alg = 1; 2075 ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2076 } 2077 2078 /* Get runtime option */ 2079 if (product->api_user) { 2080 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");CHKERRQ(ierr); 2081 ierr = PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 2082 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2083 } else { 2084 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");CHKERRQ(ierr); 2085 ierr = PetscOptionsEList("-matproduct_abt_via","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 2086 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2087 } 2088 if (flg) { 2089 ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2090 } 2091 2092 C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ; 2093 C->ops->productsymbolic = MatProductSymbolic_ABt; 2094 PetscFunctionReturn(0); 2095 } 2096 2097 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C) 2098 { 2099 PetscErrorCode ierr; 2100 Mat_Product *product = C->product; 2101 PetscBool flg = PETSC_FALSE; 2102 PetscInt alg = 0; /* default algorithm -- alg=1 should be default!!! */ 2103 #if !defined(PETSC_HAVE_HYPRE) 2104 const char *algTypes[2] = {"scalable","rap"}; 2105 PetscInt nalg = 2; 2106 #else 2107 const char *algTypes[3] = {"scalable","rap","hypre"}; 2108 PetscInt nalg = 3; 2109 #endif 2110 2111 PetscFunctionBegin; 2112 /* Set default algorithm */ 2113 ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr); 2114 if (flg) { 2115 ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2116 } 2117 2118 /* Get runtime option */ 2119 if (product->api_user) { 2120 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 2121 ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 2122 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2123 } else { 2124 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr); 2125 ierr = PetscOptionsEList("-matproduct_ptap_via","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 2126 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2127 } 2128 if (flg) { 2129 ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2130 } 2131 2132 C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ; 2133 PetscFunctionReturn(0); 2134 } 2135 2136 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C) 2137 { 2138 PetscErrorCode ierr; 2139 Mat_Product *product = C->product; 2140 PetscBool flg = PETSC_FALSE; 2141 PetscInt alg = 0; /* default algorithm */ 2142 const char *algTypes[3] = {"r*a*rt","r*art","coloring_rart"}; 2143 PetscInt nalg = 3; 2144 2145 PetscFunctionBegin; 2146 /* Set default algorithm */ 2147 ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr); 2148 if (flg) { 2149 ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2150 } 2151 2152 /* Get runtime option */ 2153 if (product->api_user) { 2154 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");CHKERRQ(ierr); 2155 ierr = PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 2156 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2157 } else { 2158 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr); 2159 ierr = PetscOptionsEList("-matproduct_rart_via","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 2160 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2161 } 2162 if (flg) { 2163 ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2164 } 2165 2166 C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ; 2167 PetscFunctionReturn(0); 2168 } 2169 2170 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */ 2171 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C) 2172 { 2173 PetscErrorCode ierr; 2174 Mat_Product *product = C->product; 2175 PetscInt alg = 0; /* default algorithm */ 2176 PetscBool flg = PETSC_FALSE; 2177 const char *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"}; 2178 PetscInt nalg = 7; 2179 2180 PetscFunctionBegin; 2181 /* Set default algorithm */ 2182 ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr); 2183 if (flg) { 2184 ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2185 } 2186 2187 /* Get runtime option */ 2188 if (product->api_user) { 2189 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr); 2190 ierr = PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 2191 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2192 } else { 2193 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr); 2194 ierr = PetscOptionsEList("-matproduct_abc_via","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 2195 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2196 } 2197 if (flg) { 2198 ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2199 } 2200 2201 C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ; 2202 C->ops->productsymbolic = MatProductSymbolic_ABC; 2203 PetscFunctionReturn(0); 2204 } 2205 2206 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C) 2207 { 2208 PetscErrorCode ierr; 2209 Mat_Product *product = C->product; 2210 2211 PetscFunctionBegin; 2212 switch (product->type) { 2213 case MATPRODUCT_AB: 2214 ierr = MatProductSetFromOptions_SeqAIJ_AB(C);CHKERRQ(ierr); 2215 break; 2216 case MATPRODUCT_AtB: 2217 ierr = MatProductSetFromOptions_SeqAIJ_AtB(C);CHKERRQ(ierr); 2218 break; 2219 case MATPRODUCT_ABt: 2220 ierr = MatProductSetFromOptions_SeqAIJ_ABt(C);CHKERRQ(ierr); 2221 break; 2222 case MATPRODUCT_PtAP: 2223 ierr = MatProductSetFromOptions_SeqAIJ_PtAP(C);CHKERRQ(ierr); 2224 break; 2225 case MATPRODUCT_RARt: 2226 ierr = MatProductSetFromOptions_SeqAIJ_RARt(C);CHKERRQ(ierr); 2227 break; 2228 case MATPRODUCT_ABC: 2229 ierr = MatProductSetFromOptions_SeqAIJ_ABC(C);CHKERRQ(ierr); 2230 break; 2231 default: 2232 break; 2233 } 2234 PetscFunctionReturn(0); 2235 } 2236