1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <../src/mat/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 #undef __FUNCT__ 8 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ" 9 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 10 { 11 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 12 PetscErrorCode ierr; 13 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 14 const PetscInt *idx; 15 PetscInt start,end,*ai,*aj,bs,*nidx2; 16 PetscBT table; 17 18 PetscFunctionBegin; 19 m = a->mbs; 20 ai = a->i; 21 aj = a->j; 22 bs = A->rmap->bs; 23 24 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 25 26 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 27 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 28 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 29 30 for (i=0; i<is_max; i++) { 31 /* Initialise the two local arrays */ 32 isz = 0; 33 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 34 35 /* Extract the indices, assume there can be duplicate entries */ 36 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 37 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 38 39 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 40 for (j=0; j<n; ++j) { 41 ival = idx[j]/bs; /* convert the indices into block indices */ 42 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 43 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 44 } 45 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 46 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 47 48 k = 0; 49 for (j=0; j<ov; j++) { /* for each overlap*/ 50 n = isz; 51 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 52 row = nidx[k]; 53 start = ai[row]; 54 end = ai[row+1]; 55 for (l = start; l<end; l++) { 56 val = aj[l]; 57 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 58 } 59 } 60 } 61 /* expand the Index Set */ 62 for (j=0; j<isz; j++) { 63 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 64 } 65 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 66 } 67 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 68 ierr = PetscFree(nidx);CHKERRQ(ierr); 69 ierr = PetscFree(nidx2);CHKERRQ(ierr); 70 PetscFunctionReturn(0); 71 } 72 73 #undef __FUNCT__ 74 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private" 75 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 76 { 77 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 78 PetscErrorCode ierr; 79 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 80 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 81 const PetscInt *irow,*icol; 82 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 83 PetscInt *aj = a->j,*ai = a->i; 84 MatScalar *mat_a; 85 Mat C; 86 PetscBool flag,sorted; 87 88 PetscFunctionBegin; 89 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 90 if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 91 92 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 93 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 94 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 95 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 96 97 ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 98 ssmap = smap; 99 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 100 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 101 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 102 /* determine lens of each row */ 103 for (i=0; i<nrows; i++) { 104 kstart = ai[irow[i]]; 105 kend = kstart + a->ilen[irow[i]]; 106 lens[i] = 0; 107 for (k=kstart; k<kend; k++) { 108 if (ssmap[aj[k]]) lens[i]++; 109 } 110 } 111 /* Create and fill new matrix */ 112 if (scall == MAT_REUSE_MATRIX) { 113 c = (Mat_SeqBAIJ*)((*B)->data); 114 115 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 116 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 117 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 118 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 119 C = *B; 120 } else { 121 ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 122 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 123 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 124 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr); 125 } 126 c = (Mat_SeqBAIJ*)(C->data); 127 for (i=0; i<nrows; i++) { 128 row = irow[i]; 129 kstart = ai[row]; 130 kend = kstart + a->ilen[row]; 131 mat_i = c->i[i]; 132 mat_j = c->j + mat_i; 133 mat_a = c->a + mat_i*bs2; 134 mat_ilen = c->ilen + i; 135 for (k=kstart; k<kend; k++) { 136 if ((tcol=ssmap[a->j[k]])) { 137 *mat_j++ = tcol - 1; 138 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 139 mat_a += bs2; 140 (*mat_ilen)++; 141 } 142 } 143 } 144 145 /* Free work space */ 146 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 147 ierr = PetscFree(smap);CHKERRQ(ierr); 148 ierr = PetscFree(lens);CHKERRQ(ierr); 149 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 150 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 151 152 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 153 *B = C; 154 PetscFunctionReturn(0); 155 } 156 157 #undef __FUNCT__ 158 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ" 159 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 160 { 161 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 162 IS is1,is2; 163 PetscErrorCode ierr; 164 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count; 165 const PetscInt *irow,*icol; 166 167 PetscFunctionBegin; 168 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 169 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 170 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 171 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 172 173 /* Verify if the indices corespond to each element in a block 174 and form the IS with compressed IS */ 175 ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr); 176 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 177 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 178 count = 0; 179 for (i=0; i<a->mbs; i++) { 180 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 181 if (vary[i]==bs) iary[count++] = i; 182 } 183 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 184 185 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 186 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 187 count = 0; 188 for (i=0; i<a->mbs; i++) { 189 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 190 if (vary[i]==bs) iary[count++] = i; 191 } 192 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 193 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 194 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 195 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 196 197 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 198 ierr = ISDestroy(&is1);CHKERRQ(ierr); 199 ierr = ISDestroy(&is2);CHKERRQ(ierr); 200 PetscFunctionReturn(0); 201 } 202 203 #undef __FUNCT__ 204 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ" 205 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 206 { 207 PetscErrorCode ierr; 208 PetscInt i; 209 210 PetscFunctionBegin; 211 if (scall == MAT_INITIAL_MATRIX) { 212 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 213 } 214 215 for (i=0; i<n; i++) { 216 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 217 } 218 PetscFunctionReturn(0); 219 } 220 221 222 /* -------------------------------------------------------*/ 223 /* Should check that shapes of vectors and matrices match */ 224 /* -------------------------------------------------------*/ 225 226 #undef __FUNCT__ 227 #define __FUNCT__ "MatMult_SeqBAIJ_1" 228 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 229 { 230 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 231 PetscScalar *z,sum; 232 const PetscScalar *x; 233 const MatScalar *v; 234 PetscErrorCode ierr; 235 PetscInt mbs,i,n,nonzerorow=0; 236 const PetscInt *idx,*ii,*ridx=PETSC_NULL; 237 PetscBool usecprow=a->compressedrow.use; 238 239 PetscFunctionBegin; 240 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 241 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 242 243 if (usecprow) { 244 mbs = a->compressedrow.nrows; 245 ii = a->compressedrow.i; 246 ridx = a->compressedrow.rindex; 247 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 248 } else { 249 mbs = a->mbs; 250 ii = a->i; 251 } 252 253 for (i=0; i<mbs; i++) { 254 n = ii[1] - ii[0]; 255 v = a->a + ii[0]; 256 idx = a->j + ii[0]; 257 ii++; 258 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 259 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 260 sum = 0.0; 261 PetscSparseDensePlusDot(sum,x,v,idx,n); 262 if (usecprow) { 263 z[ridx[i]] = sum; 264 } else { 265 nonzerorow += (n>0); 266 z[i] = sum; 267 } 268 } 269 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 270 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 271 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 272 PetscFunctionReturn(0); 273 } 274 275 #undef __FUNCT__ 276 #define __FUNCT__ "MatMult_SeqBAIJ_2" 277 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 278 { 279 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 280 PetscScalar *z = 0,sum1,sum2,*zarray; 281 const PetscScalar *x,*xb; 282 PetscScalar x1,x2; 283 const MatScalar *v; 284 PetscErrorCode ierr; 285 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 286 PetscBool usecprow=a->compressedrow.use; 287 288 PetscFunctionBegin; 289 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 290 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 291 292 idx = a->j; 293 v = a->a; 294 if (usecprow) { 295 mbs = a->compressedrow.nrows; 296 ii = a->compressedrow.i; 297 ridx = a->compressedrow.rindex; 298 } else { 299 mbs = a->mbs; 300 ii = a->i; 301 z = zarray; 302 } 303 304 for (i=0; i<mbs; i++) { 305 n = ii[1] - ii[0]; ii++; 306 sum1 = 0.0; sum2 = 0.0; 307 nonzerorow += (n>0); 308 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 309 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 310 for (j=0; j<n; j++) { 311 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 312 sum1 += v[0]*x1 + v[2]*x2; 313 sum2 += v[1]*x1 + v[3]*x2; 314 v += 4; 315 } 316 if (usecprow) z = zarray + 2*ridx[i]; 317 z[0] = sum1; z[1] = sum2; 318 if (!usecprow) z += 2; 319 } 320 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 321 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 322 ierr = PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);CHKERRQ(ierr); 323 PetscFunctionReturn(0); 324 } 325 326 #undef __FUNCT__ 327 #define __FUNCT__ "MatMult_SeqBAIJ_3" 328 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 329 { 330 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 331 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 332 const PetscScalar *x,*xb; 333 const MatScalar *v; 334 PetscErrorCode ierr; 335 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 336 PetscBool usecprow=a->compressedrow.use; 337 338 339 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 340 #pragma disjoint(*v,*z,*xb) 341 #endif 342 343 PetscFunctionBegin; 344 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 345 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 346 347 idx = a->j; 348 v = a->a; 349 if (usecprow) { 350 mbs = a->compressedrow.nrows; 351 ii = a->compressedrow.i; 352 ridx = a->compressedrow.rindex; 353 } else { 354 mbs = a->mbs; 355 ii = a->i; 356 z = zarray; 357 } 358 359 for (i=0; i<mbs; i++) { 360 n = ii[1] - ii[0]; ii++; 361 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 362 nonzerorow += (n>0); 363 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 364 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 365 for (j=0; j<n; j++) { 366 xb = x + 3*(*idx++); 367 x1 = xb[0]; 368 x2 = xb[1]; 369 x3 = xb[2]; 370 371 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 372 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 373 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 374 v += 9; 375 } 376 if (usecprow) z = zarray + 3*ridx[i]; 377 z[0] = sum1; z[1] = sum2; z[2] = sum3; 378 if (!usecprow) z += 3; 379 } 380 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 381 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 382 ierr = PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);CHKERRQ(ierr); 383 PetscFunctionReturn(0); 384 } 385 386 #undef __FUNCT__ 387 #define __FUNCT__ "MatMult_SeqBAIJ_4" 388 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 389 { 390 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 391 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 392 const PetscScalar *x,*xb; 393 const MatScalar *v; 394 PetscErrorCode ierr; 395 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 396 PetscBool usecprow=a->compressedrow.use; 397 398 PetscFunctionBegin; 399 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 400 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 401 402 idx = a->j; 403 v = a->a; 404 if (usecprow) { 405 mbs = a->compressedrow.nrows; 406 ii = a->compressedrow.i; 407 ridx = a->compressedrow.rindex; 408 } else { 409 mbs = a->mbs; 410 ii = a->i; 411 z = zarray; 412 } 413 414 for (i=0; i<mbs; i++) { 415 n = ii[1] - ii[0]; 416 ii++; 417 sum1 = 0.0; 418 sum2 = 0.0; 419 sum3 = 0.0; 420 sum4 = 0.0; 421 422 nonzerorow += (n>0); 423 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 424 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 425 for (j=0; j<n; j++) { 426 xb = x + 4*(*idx++); 427 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 428 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 429 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 430 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 431 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 432 v += 16; 433 } 434 if (usecprow) z = zarray + 4*ridx[i]; 435 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 436 if (!usecprow) z += 4; 437 } 438 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 439 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 440 ierr = PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);CHKERRQ(ierr); 441 PetscFunctionReturn(0); 442 } 443 444 #undef __FUNCT__ 445 #define __FUNCT__ "MatMult_SeqBAIJ_5" 446 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 447 { 448 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 449 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 450 const PetscScalar *xb,*x; 451 const MatScalar *v; 452 PetscErrorCode ierr; 453 const PetscInt *idx,*ii,*ridx=PETSC_NULL; 454 PetscInt mbs,i,j,n,nonzerorow=0; 455 PetscBool usecprow=a->compressedrow.use; 456 457 PetscFunctionBegin; 458 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 459 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 460 461 idx = a->j; 462 v = a->a; 463 if (usecprow) { 464 mbs = a->compressedrow.nrows; 465 ii = a->compressedrow.i; 466 ridx = a->compressedrow.rindex; 467 } else { 468 mbs = a->mbs; 469 ii = a->i; 470 z = zarray; 471 } 472 473 for (i=0; i<mbs; i++) { 474 n = ii[1] - ii[0]; ii++; 475 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 476 nonzerorow += (n>0); 477 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 478 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 479 for (j=0; j<n; j++) { 480 xb = x + 5*(*idx++); 481 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 482 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 483 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 484 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 485 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 486 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 487 v += 25; 488 } 489 if (usecprow) z = zarray + 5*ridx[i]; 490 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 491 if (!usecprow) z += 5; 492 } 493 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 494 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 495 ierr = PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);CHKERRQ(ierr); 496 PetscFunctionReturn(0); 497 } 498 499 500 #undef __FUNCT__ 501 #define __FUNCT__ "MatMult_SeqBAIJ_6" 502 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 503 { 504 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 505 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 506 const PetscScalar *x,*xb; 507 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 508 const MatScalar *v; 509 PetscErrorCode ierr; 510 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 511 PetscBool usecprow=a->compressedrow.use; 512 513 PetscFunctionBegin; 514 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 515 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 516 517 idx = a->j; 518 v = a->a; 519 if (usecprow) { 520 mbs = a->compressedrow.nrows; 521 ii = a->compressedrow.i; 522 ridx = a->compressedrow.rindex; 523 } else { 524 mbs = a->mbs; 525 ii = a->i; 526 z = zarray; 527 } 528 529 for (i=0; i<mbs; i++) { 530 n = ii[1] - ii[0]; 531 ii++; 532 sum1 = 0.0; 533 sum2 = 0.0; 534 sum3 = 0.0; 535 sum4 = 0.0; 536 sum5 = 0.0; 537 sum6 = 0.0; 538 539 nonzerorow += (n>0); 540 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 541 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 542 for (j=0; j<n; j++) { 543 xb = x + 6*(*idx++); 544 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 545 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 546 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 547 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 548 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 549 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 550 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 551 v += 36; 552 } 553 if (usecprow) z = zarray + 6*ridx[i]; 554 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 555 if (!usecprow) z += 6; 556 } 557 558 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 559 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 560 ierr = PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);CHKERRQ(ierr); 561 PetscFunctionReturn(0); 562 } 563 564 #undef __FUNCT__ 565 #define __FUNCT__ "MatMult_SeqBAIJ_7" 566 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 567 { 568 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 569 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 570 const PetscScalar *x,*xb; 571 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 572 const MatScalar *v; 573 PetscErrorCode ierr; 574 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 575 PetscBool usecprow=a->compressedrow.use; 576 577 PetscFunctionBegin; 578 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 579 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 580 581 idx = a->j; 582 v = a->a; 583 if (usecprow) { 584 mbs = a->compressedrow.nrows; 585 ii = a->compressedrow.i; 586 ridx = a->compressedrow.rindex; 587 } else { 588 mbs = a->mbs; 589 ii = a->i; 590 z = zarray; 591 } 592 593 for (i=0; i<mbs; i++) { 594 n = ii[1] - ii[0]; 595 ii++; 596 sum1 = 0.0; 597 sum2 = 0.0; 598 sum3 = 0.0; 599 sum4 = 0.0; 600 sum5 = 0.0; 601 sum6 = 0.0; 602 sum7 = 0.0; 603 604 nonzerorow += (n>0); 605 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 606 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 607 for (j=0; j<n; j++) { 608 xb = x + 7*(*idx++); 609 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 610 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 611 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 612 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 613 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 614 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 615 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 616 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 617 v += 49; 618 } 619 if (usecprow) z = zarray + 7*ridx[i]; 620 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 621 if (!usecprow) z += 7; 622 } 623 624 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 625 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 626 ierr = PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);CHKERRQ(ierr); 627 PetscFunctionReturn(0); 628 } 629 630 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 631 /* Default MatMult for block size 15 */ 632 633 #undef __FUNCT__ 634 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1" 635 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 636 { 637 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 638 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 639 const PetscScalar *x,*xb; 640 PetscScalar *zarray,xv; 641 const MatScalar *v; 642 PetscErrorCode ierr; 643 const PetscInt *ii,*ij=a->j,*idx; 644 PetscInt mbs,i,j,k,n,*ridx=PETSC_NULL,nonzerorow=0; 645 PetscBool usecprow=a->compressedrow.use; 646 647 PetscFunctionBegin; 648 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 649 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 650 651 v = a->a; 652 if (usecprow) { 653 mbs = a->compressedrow.nrows; 654 ii = a->compressedrow.i; 655 ridx = a->compressedrow.rindex; 656 } else { 657 mbs = a->mbs; 658 ii = a->i; 659 z = zarray; 660 } 661 662 for (i=0; i<mbs; i++) { 663 n = ii[i+1] - ii[i]; 664 idx = ij + ii[i]; 665 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 666 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 667 668 nonzerorow += (n>0); 669 for (j=0; j<n; j++) { 670 xb = x + 15*(idx[j]); 671 672 for (k=0; k<15; k++) { 673 xv = xb[k]; 674 sum1 += v[0]*xv; 675 sum2 += v[1]*xv; 676 sum3 += v[2]*xv; 677 sum4 += v[3]*xv; 678 sum5 += v[4]*xv; 679 sum6 += v[5]*xv; 680 sum7 += v[6]*xv; 681 sum8 += v[7]*xv; 682 sum9 += v[8]*xv; 683 sum10 += v[9]*xv; 684 sum11 += v[10]*xv; 685 sum12 += v[11]*xv; 686 sum13 += v[12]*xv; 687 sum14 += v[13]*xv; 688 sum15 += v[14]*xv; 689 v += 15; 690 } 691 } 692 if (usecprow) z = zarray + 15*ridx[i]; 693 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 694 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 695 696 if (!usecprow) z += 15; 697 } 698 699 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 700 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 701 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 702 PetscFunctionReturn(0); 703 } 704 705 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 706 #undef __FUNCT__ 707 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2" 708 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 709 { 710 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 711 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 712 const PetscScalar *x,*xb; 713 PetscScalar x1,x2,x3,x4,*zarray; 714 const MatScalar *v; 715 PetscErrorCode ierr; 716 const PetscInt *ii,*ij=a->j,*idx; 717 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 718 PetscBool usecprow=a->compressedrow.use; 719 720 PetscFunctionBegin; 721 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 722 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 723 724 v = a->a; 725 if (usecprow) { 726 mbs = a->compressedrow.nrows; 727 ii = a->compressedrow.i; 728 ridx = a->compressedrow.rindex; 729 } else { 730 mbs = a->mbs; 731 ii = a->i; 732 z = zarray; 733 } 734 735 for (i=0; i<mbs; i++) { 736 n = ii[i+1] - ii[i]; 737 idx = ij + ii[i]; 738 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 739 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 740 741 nonzerorow += (n>0); 742 for (j=0; j<n; j++) { 743 xb = x + 15*(idx[j]); 744 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 745 746 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 747 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 748 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 749 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 750 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 751 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 752 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 753 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 754 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 755 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 756 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 757 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 758 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 759 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 760 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 761 762 v += 60; 763 764 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 765 766 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 767 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 768 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 769 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 770 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 771 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 772 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 773 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 774 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 775 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 776 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 777 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 778 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 779 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 780 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 781 v += 60; 782 783 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 784 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 785 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 786 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 787 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 788 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 789 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 790 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 791 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 792 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 793 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 794 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 795 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 796 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 797 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 798 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 799 v += 60; 800 801 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 802 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 803 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 804 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 805 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 806 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 807 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 808 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 809 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 810 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 811 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 812 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 813 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 814 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 815 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 816 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 817 v += 45; 818 } 819 if (usecprow) z = zarray + 15*ridx[i]; 820 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 821 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 822 823 if (!usecprow) z += 15; 824 } 825 826 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 827 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 828 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 829 PetscFunctionReturn(0); 830 } 831 832 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 833 #undef __FUNCT__ 834 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3" 835 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 836 { 837 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 838 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 839 const PetscScalar *x,*xb; 840 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 841 const MatScalar *v; 842 PetscErrorCode ierr; 843 const PetscInt *ii,*ij=a->j,*idx; 844 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 845 PetscBool usecprow=a->compressedrow.use; 846 847 PetscFunctionBegin; 848 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 849 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 850 851 v = a->a; 852 if (usecprow) { 853 mbs = a->compressedrow.nrows; 854 ii = a->compressedrow.i; 855 ridx = a->compressedrow.rindex; 856 } else { 857 mbs = a->mbs; 858 ii = a->i; 859 z = zarray; 860 } 861 862 for (i=0; i<mbs; i++) { 863 n = ii[i+1] - ii[i]; 864 idx = ij + ii[i]; 865 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 866 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 867 868 nonzerorow += (n>0); 869 for (j=0; j<n; j++) { 870 xb = x + 15*(idx[j]); 871 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 872 x8 = xb[7]; 873 874 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 875 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 876 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 877 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 878 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 879 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 880 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 881 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 882 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 883 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 884 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 885 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 886 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 887 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 888 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 889 v += 120; 890 891 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 892 893 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 894 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 895 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 896 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 897 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 898 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 899 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 900 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 901 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 902 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 903 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 904 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 905 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 906 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 907 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 908 v += 105; 909 } 910 if (usecprow) z = zarray + 15*ridx[i]; 911 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 912 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 913 914 if (!usecprow) z += 15; 915 } 916 917 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 918 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 919 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 920 PetscFunctionReturn(0); 921 } 922 923 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 924 925 #undef __FUNCT__ 926 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4" 927 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 928 { 929 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 930 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 931 const PetscScalar *x,*xb; 932 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 933 const MatScalar *v; 934 PetscErrorCode ierr; 935 const PetscInt *ii,*ij=a->j,*idx; 936 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 937 PetscBool usecprow=a->compressedrow.use; 938 939 PetscFunctionBegin; 940 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 941 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 942 943 v = a->a; 944 if (usecprow) { 945 mbs = a->compressedrow.nrows; 946 ii = a->compressedrow.i; 947 ridx = a->compressedrow.rindex; 948 } else { 949 mbs = a->mbs; 950 ii = a->i; 951 z = zarray; 952 } 953 954 for (i=0; i<mbs; i++) { 955 n = ii[i+1] - ii[i]; 956 idx = ij + ii[i]; 957 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 958 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 959 960 nonzerorow += (n>0); 961 for (j=0; j<n; j++) { 962 xb = x + 15*(idx[j]); 963 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 964 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 965 966 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 967 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 968 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 969 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 970 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 971 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 972 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 973 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 974 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 975 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 976 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 977 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 978 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 979 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 980 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 981 v += 225; 982 } 983 if (usecprow) z = zarray + 15*ridx[i]; 984 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 985 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 986 987 if (!usecprow) z += 15; 988 } 989 990 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 991 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 992 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 993 PetscFunctionReturn(0); 994 } 995 996 997 /* 998 This will not work with MatScalar == float because it calls the BLAS 999 */ 1000 #undef __FUNCT__ 1001 #define __FUNCT__ "MatMult_SeqBAIJ_N" 1002 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1003 { 1004 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1005 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 1006 MatScalar *v; 1007 PetscErrorCode ierr; 1008 PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 1009 PetscInt ncols,k,*ridx=PETSC_NULL,nonzerorow=0; 1010 PetscBool usecprow=a->compressedrow.use; 1011 1012 PetscFunctionBegin; 1013 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1014 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1015 1016 idx = a->j; 1017 v = a->a; 1018 if (usecprow) { 1019 mbs = a->compressedrow.nrows; 1020 ii = a->compressedrow.i; 1021 ridx = a->compressedrow.rindex; 1022 } else { 1023 mbs = a->mbs; 1024 ii = a->i; 1025 z = zarray; 1026 } 1027 1028 if (!a->mult_work) { 1029 k = PetscMax(A->rmap->n,A->cmap->n); 1030 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1031 } 1032 work = a->mult_work; 1033 for (i=0; i<mbs; i++) { 1034 n = ii[1] - ii[0]; ii++; 1035 ncols = n*bs; 1036 workt = work; 1037 nonzerorow += (n>0); 1038 for (j=0; j<n; j++) { 1039 xb = x + bs*(*idx++); 1040 for (k=0; k<bs; k++) workt[k] = xb[k]; 1041 workt += bs; 1042 } 1043 if (usecprow) z = zarray + bs*ridx[i]; 1044 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1045 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1046 v += n*bs2; 1047 if (!usecprow) z += bs; 1048 } 1049 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1050 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1051 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);CHKERRQ(ierr); 1052 PetscFunctionReturn(0); 1053 } 1054 1055 #undef __FUNCT__ 1056 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1" 1057 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1058 { 1059 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1060 const PetscScalar *x; 1061 PetscScalar *y,*z,sum; 1062 const MatScalar *v; 1063 PetscErrorCode ierr; 1064 PetscInt mbs=a->mbs,i,n,*ridx=PETSC_NULL,nonzerorow=0; 1065 const PetscInt *idx,*ii; 1066 PetscBool usecprow=a->compressedrow.use; 1067 1068 PetscFunctionBegin; 1069 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1070 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1071 if (zz != yy) { 1072 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1073 } else { 1074 z = y; 1075 } 1076 1077 idx = a->j; 1078 v = a->a; 1079 if (usecprow) { 1080 if (zz != yy) { 1081 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1082 } 1083 mbs = a->compressedrow.nrows; 1084 ii = a->compressedrow.i; 1085 ridx = a->compressedrow.rindex; 1086 } else { 1087 ii = a->i; 1088 } 1089 1090 for (i=0; i<mbs; i++) { 1091 n = ii[1] - ii[0]; 1092 ii++; 1093 if (!usecprow) { 1094 nonzerorow += (n>0); 1095 sum = y[i]; 1096 } else { 1097 sum = y[ridx[i]]; 1098 } 1099 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1100 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1101 PetscSparseDensePlusDot(sum,x,v,idx,n); 1102 v += n; 1103 idx += n; 1104 if (usecprow) { 1105 z[ridx[i]] = sum; 1106 } else { 1107 z[i] = sum; 1108 } 1109 } 1110 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1111 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1112 if (zz != yy) { 1113 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1114 } 1115 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 1116 PetscFunctionReturn(0); 1117 } 1118 1119 #undef __FUNCT__ 1120 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2" 1121 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1122 { 1123 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1124 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2; 1125 PetscScalar x1,x2,*yarray,*zarray; 1126 MatScalar *v; 1127 PetscErrorCode ierr; 1128 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1129 PetscBool usecprow=a->compressedrow.use; 1130 1131 PetscFunctionBegin; 1132 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1133 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1134 if (zz != yy) { 1135 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1136 } else { 1137 zarray = yarray; 1138 } 1139 1140 idx = a->j; 1141 v = a->a; 1142 if (usecprow) { 1143 if (zz != yy) { 1144 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1145 } 1146 mbs = a->compressedrow.nrows; 1147 ii = a->compressedrow.i; 1148 ridx = a->compressedrow.rindex; 1149 if (zz != yy) { 1150 ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1151 } 1152 } else { 1153 ii = a->i; 1154 y = yarray; 1155 z = zarray; 1156 } 1157 1158 for (i=0; i<mbs; i++) { 1159 n = ii[1] - ii[0]; ii++; 1160 if (usecprow) { 1161 z = zarray + 2*ridx[i]; 1162 y = yarray + 2*ridx[i]; 1163 } 1164 sum1 = y[0]; sum2 = y[1]; 1165 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1166 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1167 for (j=0; j<n; j++) { 1168 xb = x + 2*(*idx++); 1169 x1 = xb[0]; 1170 x2 = xb[1]; 1171 1172 sum1 += v[0]*x1 + v[2]*x2; 1173 sum2 += v[1]*x1 + v[3]*x2; 1174 v += 4; 1175 } 1176 z[0] = sum1; z[1] = sum2; 1177 if (!usecprow) { 1178 z += 2; y += 2; 1179 } 1180 } 1181 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1182 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1183 if (zz != yy) { 1184 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1185 } 1186 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1187 PetscFunctionReturn(0); 1188 } 1189 1190 #undef __FUNCT__ 1191 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3" 1192 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1193 { 1194 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1195 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1196 MatScalar *v; 1197 PetscErrorCode ierr; 1198 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1199 PetscBool usecprow=a->compressedrow.use; 1200 1201 PetscFunctionBegin; 1202 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1203 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1204 if (zz != yy) { 1205 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1206 } else { 1207 zarray = yarray; 1208 } 1209 1210 idx = a->j; 1211 v = a->a; 1212 if (usecprow) { 1213 if (zz != yy) { 1214 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1215 } 1216 mbs = a->compressedrow.nrows; 1217 ii = a->compressedrow.i; 1218 ridx = a->compressedrow.rindex; 1219 } else { 1220 ii = a->i; 1221 y = yarray; 1222 z = zarray; 1223 } 1224 1225 for (i=0; i<mbs; i++) { 1226 n = ii[1] - ii[0]; ii++; 1227 if (usecprow) { 1228 z = zarray + 3*ridx[i]; 1229 y = yarray + 3*ridx[i]; 1230 } 1231 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1232 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1233 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1234 for (j=0; j<n; j++) { 1235 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1236 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1237 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1238 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1239 v += 9; 1240 } 1241 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1242 if (!usecprow) { 1243 z += 3; y += 3; 1244 } 1245 } 1246 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1247 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1248 if (zz != yy) { 1249 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1250 } 1251 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1252 PetscFunctionReturn(0); 1253 } 1254 1255 #undef __FUNCT__ 1256 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4" 1257 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1258 { 1259 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1260 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1261 MatScalar *v; 1262 PetscErrorCode ierr; 1263 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1264 PetscBool usecprow=a->compressedrow.use; 1265 1266 PetscFunctionBegin; 1267 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1268 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1269 if (zz != yy) { 1270 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1271 } else { 1272 zarray = yarray; 1273 } 1274 1275 idx = a->j; 1276 v = a->a; 1277 if (usecprow) { 1278 if (zz != yy) { 1279 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1280 } 1281 mbs = a->compressedrow.nrows; 1282 ii = a->compressedrow.i; 1283 ridx = a->compressedrow.rindex; 1284 } else { 1285 ii = a->i; 1286 y = yarray; 1287 z = zarray; 1288 } 1289 1290 for (i=0; i<mbs; i++) { 1291 n = ii[1] - ii[0]; ii++; 1292 if (usecprow) { 1293 z = zarray + 4*ridx[i]; 1294 y = yarray + 4*ridx[i]; 1295 } 1296 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1297 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1298 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1299 for (j=0; j<n; j++) { 1300 xb = x + 4*(*idx++); 1301 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1302 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1303 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1304 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1305 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1306 v += 16; 1307 } 1308 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1309 if (!usecprow) { 1310 z += 4; y += 4; 1311 } 1312 } 1313 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1314 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1315 if (zz != yy) { 1316 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1317 } 1318 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1319 PetscFunctionReturn(0); 1320 } 1321 1322 #undef __FUNCT__ 1323 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5" 1324 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1325 { 1326 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1327 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1328 PetscScalar *yarray,*zarray; 1329 MatScalar *v; 1330 PetscErrorCode ierr; 1331 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1332 PetscBool usecprow=a->compressedrow.use; 1333 1334 PetscFunctionBegin; 1335 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1336 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1337 if (zz != yy) { 1338 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1339 } else { 1340 zarray = yarray; 1341 } 1342 1343 idx = a->j; 1344 v = a->a; 1345 if (usecprow) { 1346 if (zz != yy) { 1347 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1348 } 1349 mbs = a->compressedrow.nrows; 1350 ii = a->compressedrow.i; 1351 ridx = a->compressedrow.rindex; 1352 } else { 1353 ii = a->i; 1354 y = yarray; 1355 z = zarray; 1356 } 1357 1358 for (i=0; i<mbs; i++) { 1359 n = ii[1] - ii[0]; ii++; 1360 if (usecprow) { 1361 z = zarray + 5*ridx[i]; 1362 y = yarray + 5*ridx[i]; 1363 } 1364 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1365 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1366 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1367 for (j=0; j<n; j++) { 1368 xb = x + 5*(*idx++); 1369 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1370 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1371 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1372 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1373 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1374 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1375 v += 25; 1376 } 1377 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1378 if (!usecprow) { 1379 z += 5; y += 5; 1380 } 1381 } 1382 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1383 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1384 if (zz != yy) { 1385 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1386 } 1387 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1388 PetscFunctionReturn(0); 1389 } 1390 #undef __FUNCT__ 1391 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6" 1392 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1393 { 1394 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1395 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6; 1396 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1397 MatScalar *v; 1398 PetscErrorCode ierr; 1399 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1400 PetscBool usecprow=a->compressedrow.use; 1401 1402 PetscFunctionBegin; 1403 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1404 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1405 if (zz != yy) { 1406 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1407 } else { 1408 zarray = yarray; 1409 } 1410 1411 idx = a->j; 1412 v = a->a; 1413 if (usecprow) { 1414 if (zz != yy) { 1415 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1416 } 1417 mbs = a->compressedrow.nrows; 1418 ii = a->compressedrow.i; 1419 ridx = a->compressedrow.rindex; 1420 } else { 1421 ii = a->i; 1422 y = yarray; 1423 z = zarray; 1424 } 1425 1426 for (i=0; i<mbs; i++) { 1427 n = ii[1] - ii[0]; ii++; 1428 if (usecprow) { 1429 z = zarray + 6*ridx[i]; 1430 y = yarray + 6*ridx[i]; 1431 } 1432 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1433 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1434 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1435 for (j=0; j<n; j++) { 1436 xb = x + 6*(*idx++); 1437 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1438 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1439 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1440 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1441 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1442 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1443 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1444 v += 36; 1445 } 1446 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1447 if (!usecprow) { 1448 z += 6; y += 6; 1449 } 1450 } 1451 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1452 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1453 if (zz != yy) { 1454 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1455 } 1456 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1457 PetscFunctionReturn(0); 1458 } 1459 1460 #undef __FUNCT__ 1461 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7" 1462 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1463 { 1464 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1465 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1466 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1467 MatScalar *v; 1468 PetscErrorCode ierr; 1469 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1470 PetscBool usecprow=a->compressedrow.use; 1471 1472 PetscFunctionBegin; 1473 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1474 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1475 if (zz != yy) { 1476 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1477 } else { 1478 zarray = yarray; 1479 } 1480 1481 idx = a->j; 1482 v = a->a; 1483 if (usecprow) { 1484 if (zz != yy) { 1485 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1486 } 1487 mbs = a->compressedrow.nrows; 1488 ii = a->compressedrow.i; 1489 ridx = a->compressedrow.rindex; 1490 } else { 1491 ii = a->i; 1492 y = yarray; 1493 z = zarray; 1494 } 1495 1496 for (i=0; i<mbs; i++) { 1497 n = ii[1] - ii[0]; ii++; 1498 if (usecprow) { 1499 z = zarray + 7*ridx[i]; 1500 y = yarray + 7*ridx[i]; 1501 } 1502 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1503 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1504 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1505 for (j=0; j<n; j++) { 1506 xb = x + 7*(*idx++); 1507 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1508 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1509 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1510 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1511 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1512 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1513 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1514 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1515 v += 49; 1516 } 1517 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1518 if (!usecprow) { 1519 z += 7; y += 7; 1520 } 1521 } 1522 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1523 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1524 if (zz != yy) { 1525 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1526 } 1527 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1528 PetscFunctionReturn(0); 1529 } 1530 1531 #undef __FUNCT__ 1532 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N" 1533 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1534 { 1535 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1536 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 1537 MatScalar *v; 1538 PetscErrorCode ierr; 1539 PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 1540 PetscInt ncols,k,*ridx=PETSC_NULL; 1541 PetscBool usecprow=a->compressedrow.use; 1542 1543 PetscFunctionBegin; 1544 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1545 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1546 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1547 1548 idx = a->j; 1549 v = a->a; 1550 if (usecprow) { 1551 mbs = a->compressedrow.nrows; 1552 ii = a->compressedrow.i; 1553 ridx = a->compressedrow.rindex; 1554 } else { 1555 mbs = a->mbs; 1556 ii = a->i; 1557 z = zarray; 1558 } 1559 1560 if (!a->mult_work) { 1561 k = PetscMax(A->rmap->n,A->cmap->n); 1562 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1563 } 1564 work = a->mult_work; 1565 for (i=0; i<mbs; i++) { 1566 n = ii[1] - ii[0]; ii++; 1567 ncols = n*bs; 1568 workt = work; 1569 for (j=0; j<n; j++) { 1570 xb = x + bs*(*idx++); 1571 for (k=0; k<bs; k++) workt[k] = xb[k]; 1572 workt += bs; 1573 } 1574 if (usecprow) z = zarray + bs*ridx[i]; 1575 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1576 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 1577 v += n*bs2; 1578 if (!usecprow) z += bs; 1579 } 1580 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1581 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1582 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1583 PetscFunctionReturn(0); 1584 } 1585 1586 #undef __FUNCT__ 1587 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ" 1588 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1589 { 1590 PetscScalar zero = 0.0; 1591 PetscErrorCode ierr; 1592 1593 PetscFunctionBegin; 1594 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1595 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1596 PetscFunctionReturn(0); 1597 } 1598 1599 #undef __FUNCT__ 1600 #define __FUNCT__ "MatMultTranspose_SeqBAIJ" 1601 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1602 { 1603 PetscScalar zero = 0.0; 1604 PetscErrorCode ierr; 1605 1606 PetscFunctionBegin; 1607 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1608 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1609 PetscFunctionReturn(0); 1610 } 1611 1612 #undef __FUNCT__ 1613 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ" 1614 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1615 { 1616 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1617 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1618 MatScalar *v; 1619 PetscErrorCode ierr; 1620 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL; 1621 Mat_CompressedRow cprow = a->compressedrow; 1622 PetscBool usecprow=cprow.use; 1623 1624 PetscFunctionBegin; 1625 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1626 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1627 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1628 1629 idx = a->j; 1630 v = a->a; 1631 if (usecprow) { 1632 mbs = cprow.nrows; 1633 ii = cprow.i; 1634 ridx = cprow.rindex; 1635 } else { 1636 mbs=a->mbs; 1637 ii = a->i; 1638 xb = x; 1639 } 1640 1641 switch (bs) { 1642 case 1: 1643 for (i=0; i<mbs; i++) { 1644 if (usecprow) xb = x + ridx[i]; 1645 x1 = xb[0]; 1646 ib = idx + ii[0]; 1647 n = ii[1] - ii[0]; ii++; 1648 for (j=0; j<n; j++) { 1649 rval = ib[j]; 1650 z[rval] += PetscConj(*v) * x1; 1651 v++; 1652 } 1653 if (!usecprow) xb++; 1654 } 1655 break; 1656 case 2: 1657 for (i=0; i<mbs; i++) { 1658 if (usecprow) xb = x + 2*ridx[i]; 1659 x1 = xb[0]; x2 = xb[1]; 1660 ib = idx + ii[0]; 1661 n = ii[1] - ii[0]; ii++; 1662 for (j=0; j<n; j++) { 1663 rval = ib[j]*2; 1664 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 1665 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 1666 v += 4; 1667 } 1668 if (!usecprow) xb += 2; 1669 } 1670 break; 1671 case 3: 1672 for (i=0; i<mbs; i++) { 1673 if (usecprow) xb = x + 3*ridx[i]; 1674 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1675 ib = idx + ii[0]; 1676 n = ii[1] - ii[0]; ii++; 1677 for (j=0; j<n; j++) { 1678 rval = ib[j]*3; 1679 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 1680 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 1681 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 1682 v += 9; 1683 } 1684 if (!usecprow) xb += 3; 1685 } 1686 break; 1687 case 4: 1688 for (i=0; i<mbs; i++) { 1689 if (usecprow) xb = x + 4*ridx[i]; 1690 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1691 ib = idx + ii[0]; 1692 n = ii[1] - ii[0]; ii++; 1693 for (j=0; j<n; j++) { 1694 rval = ib[j]*4; 1695 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 1696 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 1697 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 1698 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 1699 v += 16; 1700 } 1701 if (!usecprow) xb += 4; 1702 } 1703 break; 1704 case 5: 1705 for (i=0; i<mbs; i++) { 1706 if (usecprow) xb = x + 5*ridx[i]; 1707 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1708 x4 = xb[3]; x5 = xb[4]; 1709 ib = idx + ii[0]; 1710 n = ii[1] - ii[0]; ii++; 1711 for (j=0; j<n; j++) { 1712 rval = ib[j]*5; 1713 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 1714 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 1715 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 1716 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 1717 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 1718 v += 25; 1719 } 1720 if (!usecprow) xb += 5; 1721 } 1722 break; 1723 default: { /* block sizes larger than 5 by 5 are handled by BLAS */ 1724 PetscInt ncols,k; 1725 PetscScalar *work,*workt,*xtmp; 1726 1727 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 1728 if (!a->mult_work) { 1729 k = PetscMax(A->rmap->n,A->cmap->n); 1730 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1731 } 1732 work = a->mult_work; 1733 xtmp = x; 1734 for (i=0; i<mbs; i++) { 1735 n = ii[1] - ii[0]; ii++; 1736 ncols = n*bs; 1737 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1738 if (usecprow) xtmp = x + bs*ridx[i]; 1739 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1740 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1741 v += n*bs2; 1742 if (!usecprow) xtmp += bs; 1743 workt = work; 1744 for (j=0; j<n; j++) { 1745 zb = z + bs*(*idx++); 1746 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1747 workt += bs; 1748 } 1749 } 1750 } 1751 } 1752 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1753 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1754 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1755 PetscFunctionReturn(0); 1756 } 1757 1758 #undef __FUNCT__ 1759 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ" 1760 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1761 { 1762 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1763 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1764 MatScalar *v; 1765 PetscErrorCode ierr; 1766 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL; 1767 Mat_CompressedRow cprow = a->compressedrow; 1768 PetscBool usecprow=cprow.use; 1769 1770 PetscFunctionBegin; 1771 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1772 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1773 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1774 1775 idx = a->j; 1776 v = a->a; 1777 if (usecprow) { 1778 mbs = cprow.nrows; 1779 ii = cprow.i; 1780 ridx = cprow.rindex; 1781 } else { 1782 mbs=a->mbs; 1783 ii = a->i; 1784 xb = x; 1785 } 1786 1787 switch (bs) { 1788 case 1: 1789 for (i=0; i<mbs; i++) { 1790 if (usecprow) xb = x + ridx[i]; 1791 x1 = xb[0]; 1792 ib = idx + ii[0]; 1793 n = ii[1] - ii[0]; ii++; 1794 for (j=0; j<n; j++) { 1795 rval = ib[j]; 1796 z[rval] += *v * x1; 1797 v++; 1798 } 1799 if (!usecprow) xb++; 1800 } 1801 break; 1802 case 2: 1803 for (i=0; i<mbs; i++) { 1804 if (usecprow) xb = x + 2*ridx[i]; 1805 x1 = xb[0]; x2 = xb[1]; 1806 ib = idx + ii[0]; 1807 n = ii[1] - ii[0]; ii++; 1808 for (j=0; j<n; j++) { 1809 rval = ib[j]*2; 1810 z[rval++] += v[0]*x1 + v[1]*x2; 1811 z[rval++] += v[2]*x1 + v[3]*x2; 1812 v += 4; 1813 } 1814 if (!usecprow) xb += 2; 1815 } 1816 break; 1817 case 3: 1818 for (i=0; i<mbs; i++) { 1819 if (usecprow) xb = x + 3*ridx[i]; 1820 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1821 ib = idx + ii[0]; 1822 n = ii[1] - ii[0]; ii++; 1823 for (j=0; j<n; j++) { 1824 rval = ib[j]*3; 1825 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 1826 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 1827 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 1828 v += 9; 1829 } 1830 if (!usecprow) xb += 3; 1831 } 1832 break; 1833 case 4: 1834 for (i=0; i<mbs; i++) { 1835 if (usecprow) xb = x + 4*ridx[i]; 1836 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1837 ib = idx + ii[0]; 1838 n = ii[1] - ii[0]; ii++; 1839 for (j=0; j<n; j++) { 1840 rval = ib[j]*4; 1841 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1842 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1843 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1844 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1845 v += 16; 1846 } 1847 if (!usecprow) xb += 4; 1848 } 1849 break; 1850 case 5: 1851 for (i=0; i<mbs; i++) { 1852 if (usecprow) xb = x + 5*ridx[i]; 1853 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1854 x4 = xb[3]; x5 = xb[4]; 1855 ib = idx + ii[0]; 1856 n = ii[1] - ii[0]; ii++; 1857 for (j=0; j<n; j++) { 1858 rval = ib[j]*5; 1859 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1860 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1861 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1862 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1863 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1864 v += 25; 1865 } 1866 if (!usecprow) xb += 5; 1867 } 1868 break; 1869 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 1870 PetscInt ncols,k; 1871 PetscScalar *work,*workt,*xtmp; 1872 1873 if (!a->mult_work) { 1874 k = PetscMax(A->rmap->n,A->cmap->n); 1875 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1876 } 1877 work = a->mult_work; 1878 xtmp = x; 1879 for (i=0; i<mbs; i++) { 1880 n = ii[1] - ii[0]; ii++; 1881 ncols = n*bs; 1882 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1883 if (usecprow) xtmp = x + bs*ridx[i]; 1884 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1885 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1886 v += n*bs2; 1887 if (!usecprow) xtmp += bs; 1888 workt = work; 1889 for (j=0; j<n; j++) { 1890 zb = z + bs*(*idx++); 1891 for (k=0; k<bs; k++) zb[k] += workt[k]; 1892 workt += bs; 1893 } 1894 } 1895 } 1896 } 1897 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1898 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1899 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1900 PetscFunctionReturn(0); 1901 } 1902 1903 #undef __FUNCT__ 1904 #define __FUNCT__ "MatScale_SeqBAIJ" 1905 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 1906 { 1907 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 1908 PetscInt totalnz = a->bs2*a->nz; 1909 PetscScalar oalpha = alpha; 1910 PetscErrorCode ierr; 1911 PetscBLASInt one = 1,tnz; 1912 1913 PetscFunctionBegin; 1914 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 1915 BLASscal_(&tnz,&oalpha,a->a,&one); 1916 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1917 PetscFunctionReturn(0); 1918 } 1919 1920 #undef __FUNCT__ 1921 #define __FUNCT__ "MatNorm_SeqBAIJ" 1922 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 1923 { 1924 PetscErrorCode ierr; 1925 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1926 MatScalar *v = a->a; 1927 PetscReal sum = 0.0; 1928 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 1929 1930 PetscFunctionBegin; 1931 if (type == NORM_FROBENIUS) { 1932 for (i=0; i< bs2*nz; i++) { 1933 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 1934 } 1935 *norm = PetscSqrtReal(sum); 1936 } else if (type == NORM_1) { /* maximum column sum */ 1937 PetscReal *tmp; 1938 PetscInt *bcol = a->j; 1939 ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr); 1940 ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr); 1941 for (i=0; i<nz; i++) { 1942 for (j=0; j<bs; j++) { 1943 k1 = bs*(*bcol) + j; /* column index */ 1944 for (k=0; k<bs; k++) { 1945 tmp[k1] += PetscAbsScalar(*v); v++; 1946 } 1947 } 1948 bcol++; 1949 } 1950 *norm = 0.0; 1951 for (j=0; j<A->cmap->n; j++) { 1952 if (tmp[j] > *norm) *norm = tmp[j]; 1953 } 1954 ierr = PetscFree(tmp);CHKERRQ(ierr); 1955 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1956 *norm = 0.0; 1957 for (k=0; k<bs; k++) { 1958 for (j=0; j<a->mbs; j++) { 1959 v = a->a + bs2*a->i[j] + k; 1960 sum = 0.0; 1961 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 1962 for (k1=0; k1<bs; k1++) { 1963 sum += PetscAbsScalar(*v); 1964 v += bs; 1965 } 1966 } 1967 if (sum > *norm) *norm = sum; 1968 } 1969 } 1970 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1971 PetscFunctionReturn(0); 1972 } 1973 1974 1975 #undef __FUNCT__ 1976 #define __FUNCT__ "MatEqual_SeqBAIJ" 1977 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 1978 { 1979 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 1980 PetscErrorCode ierr; 1981 1982 PetscFunctionBegin; 1983 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1984 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1985 *flg = PETSC_FALSE; 1986 PetscFunctionReturn(0); 1987 } 1988 1989 /* if the a->i are the same */ 1990 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1991 if (!*flg) PetscFunctionReturn(0); 1992 1993 /* if a->j are the same */ 1994 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1995 if (!*flg) PetscFunctionReturn(0); 1996 1997 /* if a->a are the same */ 1998 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1999 PetscFunctionReturn(0); 2000 2001 } 2002 2003 #undef __FUNCT__ 2004 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ" 2005 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 2006 { 2007 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2008 PetscErrorCode ierr; 2009 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 2010 PetscScalar *x,zero = 0.0; 2011 MatScalar *aa,*aa_j; 2012 2013 PetscFunctionBegin; 2014 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2015 bs = A->rmap->bs; 2016 aa = a->a; 2017 ai = a->i; 2018 aj = a->j; 2019 ambs = a->mbs; 2020 bs2 = a->bs2; 2021 2022 ierr = VecSet(v,zero);CHKERRQ(ierr); 2023 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 2024 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 2025 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2026 for (i=0; i<ambs; i++) { 2027 for (j=ai[i]; j<ai[i+1]; j++) { 2028 if (aj[j] == i) { 2029 row = i*bs; 2030 aa_j = aa+j*bs2; 2031 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2032 break; 2033 } 2034 } 2035 } 2036 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2037 PetscFunctionReturn(0); 2038 } 2039 2040 #undef __FUNCT__ 2041 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ" 2042 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2043 { 2044 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2045 const PetscScalar *l,*r,*li,*ri; 2046 PetscScalar x; 2047 MatScalar *aa, *v; 2048 PetscErrorCode ierr; 2049 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2050 const PetscInt *ai,*aj; 2051 2052 PetscFunctionBegin; 2053 ai = a->i; 2054 aj = a->j; 2055 aa = a->a; 2056 m = A->rmap->n; 2057 n = A->cmap->n; 2058 bs = A->rmap->bs; 2059 mbs = a->mbs; 2060 bs2 = a->bs2; 2061 if (ll) { 2062 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2063 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2064 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2065 for (i=0; i<mbs; i++) { /* for each block row */ 2066 M = ai[i+1] - ai[i]; 2067 li = l + i*bs; 2068 v = aa + bs2*ai[i]; 2069 for (j=0; j<M; j++) { /* for each block */ 2070 for (k=0; k<bs2; k++) { 2071 (*v++) *= li[k%bs]; 2072 } 2073 } 2074 } 2075 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2076 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2077 } 2078 2079 if (rr) { 2080 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2081 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2082 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2083 for (i=0; i<mbs; i++) { /* for each block row */ 2084 iai = ai[i]; 2085 M = ai[i+1] - iai; 2086 v = aa + bs2*iai; 2087 for (j=0; j<M; j++) { /* for each block */ 2088 ri = r + bs*aj[iai+j]; 2089 for (k=0; k<bs; k++) { 2090 x = ri[k]; 2091 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2092 v += bs; 2093 } 2094 } 2095 } 2096 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2097 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2098 } 2099 PetscFunctionReturn(0); 2100 } 2101 2102 2103 #undef __FUNCT__ 2104 #define __FUNCT__ "MatGetInfo_SeqBAIJ" 2105 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2106 { 2107 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2108 2109 PetscFunctionBegin; 2110 info->block_size = a->bs2; 2111 info->nz_allocated = a->bs2*a->maxnz; 2112 info->nz_used = a->bs2*a->nz; 2113 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2114 info->assemblies = A->num_ass; 2115 info->mallocs = A->info.mallocs; 2116 info->memory = ((PetscObject)A)->mem; 2117 if (A->factortype) { 2118 info->fill_ratio_given = A->info.fill_ratio_given; 2119 info->fill_ratio_needed = A->info.fill_ratio_needed; 2120 info->factor_mallocs = A->info.factor_mallocs; 2121 } else { 2122 info->fill_ratio_given = 0; 2123 info->fill_ratio_needed = 0; 2124 info->factor_mallocs = 0; 2125 } 2126 PetscFunctionReturn(0); 2127 } 2128 2129 2130 #undef __FUNCT__ 2131 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2132 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2133 { 2134 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2135 PetscErrorCode ierr; 2136 2137 PetscFunctionBegin; 2138 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2139 PetscFunctionReturn(0); 2140 } 2141