1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <../src/mat/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 #undef __FUNCT__ 8 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ" 9 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 10 { 11 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 12 PetscErrorCode ierr; 13 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 14 const PetscInt *idx; 15 PetscInt start,end,*ai,*aj,bs,*nidx2; 16 PetscBT table; 17 18 PetscFunctionBegin; 19 m = a->mbs; 20 ai = a->i; 21 aj = a->j; 22 bs = A->rmap->bs; 23 24 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 25 26 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 27 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 28 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 29 30 for (i=0; i<is_max; i++) { 31 /* Initialise the two local arrays */ 32 isz = 0; 33 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 34 35 /* Extract the indices, assume there can be duplicate entries */ 36 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 37 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 38 39 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 40 for (j=0; j<n ; ++j) { 41 ival = idx[j]/bs; /* convert the indices into block indices */ 42 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 43 if (!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;} 44 } 45 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 46 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 47 48 k = 0; 49 for (j=0; j<ov; j++) { /* for each overlap*/ 50 n = isz; 51 for (; k<n ; k++) { /* do only those rows in nidx[k], which are not done yet */ 52 row = nidx[k]; 53 start = ai[row]; 54 end = ai[row+1]; 55 for (l = start; l<end ; l++) { 56 val = aj[l]; 57 if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;} 58 } 59 } 60 } 61 /* expand the Index Set */ 62 for (j=0; j<isz; j++) { 63 for (k=0; k<bs; k++) 64 nidx2[j*bs+k] = nidx[j]*bs+k; 65 } 66 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 67 } 68 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 69 ierr = PetscFree(nidx);CHKERRQ(ierr); 70 ierr = PetscFree(nidx2);CHKERRQ(ierr); 71 PetscFunctionReturn(0); 72 } 73 74 #undef __FUNCT__ 75 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private" 76 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 77 { 78 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 79 PetscErrorCode ierr; 80 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 81 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 82 const PetscInt *irow,*icol; 83 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 84 PetscInt *aj = a->j,*ai = a->i; 85 MatScalar *mat_a; 86 Mat C; 87 PetscBool flag,sorted; 88 89 PetscFunctionBegin; 90 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 91 if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 92 93 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 94 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 95 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 96 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 97 98 ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 99 ssmap = smap; 100 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 101 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 102 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 103 /* determine lens of each row */ 104 for (i=0; i<nrows; i++) { 105 kstart = ai[irow[i]]; 106 kend = kstart + a->ilen[irow[i]]; 107 lens[i] = 0; 108 for (k=kstart; k<kend; k++) { 109 if (ssmap[aj[k]]) { 110 lens[i]++; 111 } 112 } 113 } 114 /* Create and fill new matrix */ 115 if (scall == MAT_REUSE_MATRIX) { 116 c = (Mat_SeqBAIJ *)((*B)->data); 117 118 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 119 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 120 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 121 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 122 C = *B; 123 } else { 124 ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 125 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 126 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 127 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr); 128 } 129 c = (Mat_SeqBAIJ *)(C->data); 130 for (i=0; i<nrows; i++) { 131 row = irow[i]; 132 kstart = ai[row]; 133 kend = kstart + a->ilen[row]; 134 mat_i = c->i[i]; 135 mat_j = c->j + mat_i; 136 mat_a = c->a + mat_i*bs2; 137 mat_ilen = c->ilen + i; 138 for (k=kstart; k<kend; k++) { 139 if ((tcol=ssmap[a->j[k]])) { 140 *mat_j++ = tcol - 1; 141 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 142 mat_a += bs2; 143 (*mat_ilen)++; 144 } 145 } 146 } 147 148 /* Free work space */ 149 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 150 ierr = PetscFree(smap);CHKERRQ(ierr); 151 ierr = PetscFree(lens);CHKERRQ(ierr); 152 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 153 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 154 155 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 156 *B = C; 157 PetscFunctionReturn(0); 158 } 159 160 #undef __FUNCT__ 161 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ" 162 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 163 { 164 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 165 IS is1,is2; 166 PetscErrorCode ierr; 167 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count; 168 const PetscInt *irow,*icol; 169 170 PetscFunctionBegin; 171 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 172 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 173 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 174 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 175 176 /* Verify if the indices corespond to each element in a block 177 and form the IS with compressed IS */ 178 ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr); 179 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 180 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 181 count = 0; 182 for (i=0; i<a->mbs; i++) { 183 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 184 if (vary[i]==bs) iary[count++] = i; 185 } 186 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 187 188 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 189 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 190 count = 0; 191 for (i=0; i<a->mbs; i++) { 192 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 193 if (vary[i]==bs) iary[count++] = i; 194 } 195 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 196 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 197 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 198 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 199 200 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 201 ierr = ISDestroy(&is1);CHKERRQ(ierr); 202 ierr = ISDestroy(&is2);CHKERRQ(ierr); 203 PetscFunctionReturn(0); 204 } 205 206 #undef __FUNCT__ 207 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ" 208 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 209 { 210 PetscErrorCode ierr; 211 PetscInt i; 212 213 PetscFunctionBegin; 214 if (scall == MAT_INITIAL_MATRIX) { 215 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 216 } 217 218 for (i=0; i<n; i++) { 219 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 220 } 221 PetscFunctionReturn(0); 222 } 223 224 225 /* -------------------------------------------------------*/ 226 /* Should check that shapes of vectors and matrices match */ 227 /* -------------------------------------------------------*/ 228 229 #undef __FUNCT__ 230 #define __FUNCT__ "MatMult_SeqBAIJ_1" 231 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 232 { 233 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 234 PetscScalar *z,sum; 235 const PetscScalar *x; 236 const MatScalar *v; 237 PetscErrorCode ierr; 238 PetscInt mbs,i,n,nonzerorow=0; 239 const PetscInt *idx,*ii,*ridx=PETSC_NULL; 240 PetscBool usecprow=a->compressedrow.use; 241 242 PetscFunctionBegin; 243 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 244 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 245 246 if (usecprow) { 247 mbs = a->compressedrow.nrows; 248 ii = a->compressedrow.i; 249 ridx = a->compressedrow.rindex; 250 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 251 } else { 252 mbs = a->mbs; 253 ii = a->i; 254 } 255 256 for (i=0; i<mbs; i++) { 257 n = ii[1] - ii[0]; 258 v = a->a + ii[0]; 259 idx = a->j + ii[0]; 260 ii++; 261 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 262 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 263 sum = 0.0; 264 PetscSparseDensePlusDot(sum,x,v,idx,n); 265 if (usecprow) { 266 z[ridx[i]] = sum; 267 } else { 268 nonzerorow += (n>0); 269 z[i] = sum; 270 } 271 } 272 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 273 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 274 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 275 PetscFunctionReturn(0); 276 } 277 278 #undef __FUNCT__ 279 #define __FUNCT__ "MatMult_SeqBAIJ_2" 280 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 281 { 282 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 283 PetscScalar *z = 0,sum1,sum2,*zarray; 284 const PetscScalar *x,*xb; 285 PetscScalar x1,x2; 286 const MatScalar *v; 287 PetscErrorCode ierr; 288 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 289 PetscBool usecprow=a->compressedrow.use; 290 291 PetscFunctionBegin; 292 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 293 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 294 295 idx = a->j; 296 v = a->a; 297 if (usecprow) { 298 mbs = a->compressedrow.nrows; 299 ii = a->compressedrow.i; 300 ridx = a->compressedrow.rindex; 301 } else { 302 mbs = a->mbs; 303 ii = a->i; 304 z = zarray; 305 } 306 307 for (i=0; i<mbs; i++) { 308 n = ii[1] - ii[0]; ii++; 309 sum1 = 0.0; sum2 = 0.0; 310 nonzerorow += (n>0); 311 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 312 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 313 for (j=0; j<n; j++) { 314 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 315 sum1 += v[0]*x1 + v[2]*x2; 316 sum2 += v[1]*x1 + v[3]*x2; 317 v += 4; 318 } 319 if (usecprow) z = zarray + 2*ridx[i]; 320 z[0] = sum1; z[1] = sum2; 321 if (!usecprow) z += 2; 322 } 323 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 324 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 325 ierr = PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);CHKERRQ(ierr); 326 PetscFunctionReturn(0); 327 } 328 329 #undef __FUNCT__ 330 #define __FUNCT__ "MatMult_SeqBAIJ_3" 331 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 332 { 333 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 334 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 335 const PetscScalar *x,*xb; 336 const MatScalar *v; 337 PetscErrorCode ierr; 338 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 339 PetscBool usecprow=a->compressedrow.use; 340 341 342 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 343 #pragma disjoint(*v,*z,*xb) 344 #endif 345 346 PetscFunctionBegin; 347 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 348 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 349 350 idx = a->j; 351 v = a->a; 352 if (usecprow) { 353 mbs = a->compressedrow.nrows; 354 ii = a->compressedrow.i; 355 ridx = a->compressedrow.rindex; 356 } else { 357 mbs = a->mbs; 358 ii = a->i; 359 z = zarray; 360 } 361 362 for (i=0; i<mbs; i++) { 363 n = ii[1] - ii[0]; ii++; 364 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 365 nonzerorow += (n>0); 366 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 367 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 368 for (j=0; j<n; j++) { 369 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 370 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 371 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 372 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 373 v += 9; 374 } 375 if (usecprow) z = zarray + 3*ridx[i]; 376 z[0] = sum1; z[1] = sum2; z[2] = sum3; 377 if (!usecprow) z += 3; 378 } 379 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 380 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 381 ierr = PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);CHKERRQ(ierr); 382 PetscFunctionReturn(0); 383 } 384 385 #undef __FUNCT__ 386 #define __FUNCT__ "MatMult_SeqBAIJ_4" 387 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 388 { 389 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 390 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 391 const PetscScalar *x,*xb; 392 const MatScalar *v; 393 PetscErrorCode ierr; 394 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 395 PetscBool usecprow=a->compressedrow.use; 396 397 PetscFunctionBegin; 398 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 399 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 400 401 idx = a->j; 402 v = a->a; 403 if (usecprow) { 404 mbs = a->compressedrow.nrows; 405 ii = a->compressedrow.i; 406 ridx = a->compressedrow.rindex; 407 } else { 408 mbs = a->mbs; 409 ii = a->i; 410 z = zarray; 411 } 412 413 for (i=0; i<mbs; i++) { 414 n = ii[1] - ii[0]; ii++; 415 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 416 nonzerorow += (n>0); 417 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 418 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 419 for (j=0; j<n; j++) { 420 xb = x + 4*(*idx++); 421 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 422 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 423 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 424 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 425 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 426 v += 16; 427 } 428 if (usecprow) z = zarray + 4*ridx[i]; 429 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 430 if (!usecprow) z += 4; 431 } 432 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 433 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 434 ierr = PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);CHKERRQ(ierr); 435 PetscFunctionReturn(0); 436 } 437 438 #undef __FUNCT__ 439 #define __FUNCT__ "MatMult_SeqBAIJ_5" 440 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 441 { 442 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 443 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 444 const PetscScalar *xb,*x; 445 const MatScalar *v; 446 PetscErrorCode ierr; 447 const PetscInt *idx,*ii,*ridx=PETSC_NULL; 448 PetscInt mbs,i,j,n,nonzerorow=0; 449 PetscBool usecprow=a->compressedrow.use; 450 451 PetscFunctionBegin; 452 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 453 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 454 455 idx = a->j; 456 v = a->a; 457 if (usecprow) { 458 mbs = a->compressedrow.nrows; 459 ii = a->compressedrow.i; 460 ridx = a->compressedrow.rindex; 461 } else { 462 mbs = a->mbs; 463 ii = a->i; 464 z = zarray; 465 } 466 467 for (i=0; i<mbs; i++) { 468 n = ii[1] - ii[0]; ii++; 469 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 470 nonzerorow += (n>0); 471 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 472 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 473 for (j=0; j<n; j++) { 474 xb = x + 5*(*idx++); 475 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 476 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 477 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 478 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 479 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 480 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 481 v += 25; 482 } 483 if (usecprow) z = zarray + 5*ridx[i]; 484 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 485 if (!usecprow) z += 5; 486 } 487 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 488 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 489 ierr = PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);CHKERRQ(ierr); 490 PetscFunctionReturn(0); 491 } 492 493 494 #undef __FUNCT__ 495 #define __FUNCT__ "MatMult_SeqBAIJ_6" 496 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 497 { 498 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 499 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 500 const PetscScalar *x,*xb; 501 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 502 const MatScalar *v; 503 PetscErrorCode ierr; 504 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 505 PetscBool usecprow=a->compressedrow.use; 506 507 PetscFunctionBegin; 508 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 509 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 510 511 idx = a->j; 512 v = a->a; 513 if (usecprow) { 514 mbs = a->compressedrow.nrows; 515 ii = a->compressedrow.i; 516 ridx = a->compressedrow.rindex; 517 } else { 518 mbs = a->mbs; 519 ii = a->i; 520 z = zarray; 521 } 522 523 for (i=0; i<mbs; i++) { 524 n = ii[1] - ii[0]; ii++; 525 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; 526 nonzerorow += (n>0); 527 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 528 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 529 for (j=0; j<n; j++) { 530 xb = x + 6*(*idx++); 531 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 532 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 533 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 534 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 535 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 536 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 537 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 538 v += 36; 539 } 540 if (usecprow) z = zarray + 6*ridx[i]; 541 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 542 if (!usecprow) z += 6; 543 } 544 545 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 546 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 547 ierr = PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);CHKERRQ(ierr); 548 PetscFunctionReturn(0); 549 } 550 551 #undef __FUNCT__ 552 #define __FUNCT__ "MatMult_SeqBAIJ_7" 553 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 554 { 555 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 556 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 557 const PetscScalar *x,*xb; 558 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 559 const MatScalar *v; 560 PetscErrorCode ierr; 561 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 562 PetscBool usecprow=a->compressedrow.use; 563 564 PetscFunctionBegin; 565 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 566 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 567 568 idx = a->j; 569 v = a->a; 570 if (usecprow) { 571 mbs = a->compressedrow.nrows; 572 ii = a->compressedrow.i; 573 ridx = a->compressedrow.rindex; 574 } else { 575 mbs = a->mbs; 576 ii = a->i; 577 z = zarray; 578 } 579 580 for (i=0; i<mbs; i++) { 581 n = ii[1] - ii[0]; ii++; 582 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 583 nonzerorow += (n>0); 584 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 585 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 586 for (j=0; j<n; j++) { 587 xb = x + 7*(*idx++); 588 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 589 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 590 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 591 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 592 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 593 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 594 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 595 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 596 v += 49; 597 } 598 if (usecprow) z = zarray + 7*ridx[i]; 599 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 600 if (!usecprow) z += 7; 601 } 602 603 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 604 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 605 ierr = PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);CHKERRQ(ierr); 606 PetscFunctionReturn(0); 607 } 608 609 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 610 /* Default MatMult for block size 15 */ 611 612 #undef __FUNCT__ 613 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1" 614 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 615 { 616 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 617 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 618 const PetscScalar *x,*xb; 619 PetscScalar *zarray,xv; 620 const MatScalar *v; 621 PetscErrorCode ierr; 622 const PetscInt *ii,*ij=a->j,*idx; 623 PetscInt mbs,i,j,k,n,*ridx=PETSC_NULL,nonzerorow=0; 624 PetscBool usecprow=a->compressedrow.use; 625 626 PetscFunctionBegin; 627 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 628 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 629 630 v = a->a; 631 if (usecprow) { 632 mbs = a->compressedrow.nrows; 633 ii = a->compressedrow.i; 634 ridx = a->compressedrow.rindex; 635 } else { 636 mbs = a->mbs; 637 ii = a->i; 638 z = zarray; 639 } 640 641 for (i=0; i<mbs; i++) { 642 n = ii[i+1] - ii[i]; 643 idx = ij + ii[i]; 644 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 645 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 646 647 nonzerorow += (n>0); 648 for (j=0; j<n; j++) { 649 xb = x + 15*(idx[j]); 650 651 for (k=0;k<15;k++) { 652 xv = xb[k]; 653 sum1 += v[0]*xv; 654 sum2 += v[1]*xv; 655 sum3 += v[2]*xv; 656 sum4 += v[3]*xv; 657 sum5 += v[4]*xv; 658 sum6 += v[5]*xv; 659 sum7 += v[6]*xv; 660 sum8 += v[7]*xv; 661 sum9 += v[8]*xv; 662 sum10 += v[9]*xv; 663 sum11 += v[10]*xv; 664 sum12 += v[11]*xv; 665 sum13 += v[12]*xv; 666 sum14 += v[13]*xv; 667 sum15 += v[14]*xv; 668 v += 15; 669 } 670 } 671 if (usecprow) z = zarray + 15*ridx[i]; 672 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 673 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 674 675 if (!usecprow) z += 15; 676 } 677 678 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 679 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 680 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 681 PetscFunctionReturn(0); 682 } 683 684 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 685 #undef __FUNCT__ 686 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2" 687 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 688 { 689 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 690 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 691 const PetscScalar *x,*xb; 692 PetscScalar x1,x2,x3,x4,*zarray; 693 const MatScalar *v; 694 PetscErrorCode ierr; 695 const PetscInt *ii,*ij=a->j,*idx; 696 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 697 PetscBool usecprow=a->compressedrow.use; 698 699 PetscFunctionBegin; 700 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 701 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 702 703 v = a->a; 704 if (usecprow) { 705 mbs = a->compressedrow.nrows; 706 ii = a->compressedrow.i; 707 ridx = a->compressedrow.rindex; 708 } else { 709 mbs = a->mbs; 710 ii = a->i; 711 z = zarray; 712 } 713 714 for (i=0; i<mbs; i++) { 715 n = ii[i+1] - ii[i]; 716 idx = ij + ii[i]; 717 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 718 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 719 720 nonzerorow += (n>0); 721 for (j=0; j<n; j++) { 722 xb = x + 15*(idx[j]); 723 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 724 725 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 726 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 727 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 728 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 729 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 730 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 731 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 732 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 733 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 734 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 735 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 736 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 737 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 738 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 739 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 740 741 v += 60; 742 743 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 744 745 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 746 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 747 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 748 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 749 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 750 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 751 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 752 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 753 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 754 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 755 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 756 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 757 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 758 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 759 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 760 v += 60; 761 762 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 763 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 764 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 765 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 766 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 767 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 768 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 769 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 770 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 771 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 772 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 773 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 774 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 775 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 776 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 777 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 778 v += 60; 779 780 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 781 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 782 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 783 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 784 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 785 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 786 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 787 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 788 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 789 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 790 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 791 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 792 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 793 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 794 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 795 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 796 v += 45; 797 } 798 if (usecprow) z = zarray + 15*ridx[i]; 799 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 800 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 801 802 if (!usecprow) z += 15; 803 } 804 805 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 806 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 807 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 808 PetscFunctionReturn(0); 809 } 810 811 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 812 #undef __FUNCT__ 813 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3" 814 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 815 { 816 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 817 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 818 const PetscScalar *x,*xb; 819 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 820 const MatScalar *v; 821 PetscErrorCode ierr; 822 const PetscInt *ii,*ij=a->j,*idx; 823 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 824 PetscBool usecprow=a->compressedrow.use; 825 826 PetscFunctionBegin; 827 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 828 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 829 830 v = a->a; 831 if (usecprow) { 832 mbs = a->compressedrow.nrows; 833 ii = a->compressedrow.i; 834 ridx = a->compressedrow.rindex; 835 } else { 836 mbs = a->mbs; 837 ii = a->i; 838 z = zarray; 839 } 840 841 for (i=0; i<mbs; i++) { 842 n = ii[i+1] - ii[i]; 843 idx = ij + ii[i]; 844 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 845 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 846 847 nonzerorow += (n>0); 848 for (j=0; j<n; j++) { 849 xb = x + 15*(idx[j]); 850 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 851 x8 = xb[7]; 852 853 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 854 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 855 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 856 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 857 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 858 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 859 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 860 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 861 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 862 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 863 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 864 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 865 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 866 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 867 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 868 v += 120; 869 870 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 871 872 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 873 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 874 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 875 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 876 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 877 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 878 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 879 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 880 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 881 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 882 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 883 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 884 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 885 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 886 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 887 v += 105; 888 } 889 if (usecprow) z = zarray + 15*ridx[i]; 890 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 891 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 892 893 if (!usecprow) z += 15; 894 } 895 896 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 897 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 898 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 899 PetscFunctionReturn(0); 900 } 901 902 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 903 904 #undef __FUNCT__ 905 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4" 906 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 907 { 908 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 909 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 910 const PetscScalar *x,*xb; 911 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 912 const MatScalar *v; 913 PetscErrorCode ierr; 914 const PetscInt *ii,*ij=a->j,*idx; 915 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 916 PetscBool usecprow=a->compressedrow.use; 917 918 PetscFunctionBegin; 919 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 920 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 921 922 v = a->a; 923 if (usecprow) { 924 mbs = a->compressedrow.nrows; 925 ii = a->compressedrow.i; 926 ridx = a->compressedrow.rindex; 927 } else { 928 mbs = a->mbs; 929 ii = a->i; 930 z = zarray; 931 } 932 933 for (i=0; i<mbs; i++) { 934 n = ii[i+1] - ii[i]; 935 idx = ij + ii[i]; 936 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 937 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 938 939 nonzerorow += (n>0); 940 for (j=0; j<n; j++) { 941 xb = x + 15*(idx[j]); 942 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 943 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 944 945 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 946 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 947 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 948 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 949 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 950 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 951 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 952 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 953 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 954 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 955 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 956 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 957 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 958 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 959 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 960 v += 225; 961 } 962 if (usecprow) z = zarray + 15*ridx[i]; 963 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 964 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 965 966 if (!usecprow) z += 15; 967 } 968 969 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 970 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 971 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 972 PetscFunctionReturn(0); 973 } 974 975 976 /* 977 This will not work with MatScalar == float because it calls the BLAS 978 */ 979 #undef __FUNCT__ 980 #define __FUNCT__ "MatMult_SeqBAIJ_N" 981 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 982 { 983 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 984 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 985 MatScalar *v; 986 PetscErrorCode ierr; 987 PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 988 PetscInt ncols,k,*ridx=PETSC_NULL,nonzerorow=0; 989 PetscBool usecprow=a->compressedrow.use; 990 991 PetscFunctionBegin; 992 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 993 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 994 995 idx = a->j; 996 v = a->a; 997 if (usecprow) { 998 mbs = a->compressedrow.nrows; 999 ii = a->compressedrow.i; 1000 ridx = a->compressedrow.rindex; 1001 } else { 1002 mbs = a->mbs; 1003 ii = a->i; 1004 z = zarray; 1005 } 1006 1007 if (!a->mult_work) { 1008 k = PetscMax(A->rmap->n,A->cmap->n); 1009 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1010 } 1011 work = a->mult_work; 1012 for (i=0; i<mbs; i++) { 1013 n = ii[1] - ii[0]; ii++; 1014 ncols = n*bs; 1015 workt = work; 1016 nonzerorow += (n>0); 1017 for (j=0; j<n; j++) { 1018 xb = x + bs*(*idx++); 1019 for (k=0; k<bs; k++) workt[k] = xb[k]; 1020 workt += bs; 1021 } 1022 if (usecprow) z = zarray + bs*ridx[i]; 1023 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1024 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1025 v += n*bs2; 1026 if (!usecprow) z += bs; 1027 } 1028 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1029 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1030 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);CHKERRQ(ierr); 1031 PetscFunctionReturn(0); 1032 } 1033 1034 #undef __FUNCT__ 1035 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1" 1036 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1037 { 1038 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1039 const PetscScalar *x; 1040 PetscScalar *y,*z,sum; 1041 const MatScalar *v; 1042 PetscErrorCode ierr; 1043 PetscInt mbs=a->mbs,i,n,*ridx=PETSC_NULL,nonzerorow=0; 1044 const PetscInt *idx,*ii; 1045 PetscBool usecprow=a->compressedrow.use; 1046 1047 PetscFunctionBegin; 1048 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1049 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1050 if (zz != yy) { 1051 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1052 } else { 1053 z = y; 1054 } 1055 1056 idx = a->j; 1057 v = a->a; 1058 if (usecprow) { 1059 if (zz != yy) { 1060 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1061 } 1062 mbs = a->compressedrow.nrows; 1063 ii = a->compressedrow.i; 1064 ridx = a->compressedrow.rindex; 1065 } else { 1066 ii = a->i; 1067 } 1068 1069 for (i=0; i<mbs; i++) { 1070 n = ii[1] - ii[0]; 1071 ii++; 1072 if (!usecprow) { 1073 nonzerorow += (n>0); 1074 sum = y[i]; 1075 } else { 1076 sum = y[ridx[i]]; 1077 } 1078 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1079 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1080 PetscSparseDensePlusDot(sum,x,v,idx,n); 1081 v += n; 1082 idx += n; 1083 if (usecprow) { 1084 z[ridx[i]] = sum; 1085 } else { 1086 z[i] = sum; 1087 } 1088 } 1089 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1090 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1091 if (zz != yy) { 1092 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1093 } 1094 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 1095 PetscFunctionReturn(0); 1096 } 1097 1098 #undef __FUNCT__ 1099 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2" 1100 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1101 { 1102 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1103 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2; 1104 PetscScalar x1,x2,*yarray,*zarray; 1105 MatScalar *v; 1106 PetscErrorCode ierr; 1107 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1108 PetscBool usecprow=a->compressedrow.use; 1109 1110 PetscFunctionBegin; 1111 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1112 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1113 if (zz != yy) { 1114 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1115 } else { 1116 zarray = yarray; 1117 } 1118 1119 idx = a->j; 1120 v = a->a; 1121 if (usecprow) { 1122 if (zz != yy) { 1123 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1124 } 1125 mbs = a->compressedrow.nrows; 1126 ii = a->compressedrow.i; 1127 ridx = a->compressedrow.rindex; 1128 if (zz != yy) { 1129 ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1130 } 1131 } else { 1132 ii = a->i; 1133 y = yarray; 1134 z = zarray; 1135 } 1136 1137 for (i=0; i<mbs; i++) { 1138 n = ii[1] - ii[0]; ii++; 1139 if (usecprow) { 1140 z = zarray + 2*ridx[i]; 1141 y = yarray + 2*ridx[i]; 1142 } 1143 sum1 = y[0]; sum2 = y[1]; 1144 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1145 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1146 for (j=0; j<n; j++) { 1147 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 1148 sum1 += v[0]*x1 + v[2]*x2; 1149 sum2 += v[1]*x1 + v[3]*x2; 1150 v += 4; 1151 } 1152 z[0] = sum1; z[1] = sum2; 1153 if (!usecprow) { 1154 z += 2; y += 2; 1155 } 1156 } 1157 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1158 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1159 if (zz != yy) { 1160 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1161 } 1162 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1163 PetscFunctionReturn(0); 1164 } 1165 1166 #undef __FUNCT__ 1167 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3" 1168 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1169 { 1170 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1171 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1172 MatScalar *v; 1173 PetscErrorCode ierr; 1174 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1175 PetscBool usecprow=a->compressedrow.use; 1176 1177 PetscFunctionBegin; 1178 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1179 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1180 if (zz != yy) { 1181 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1182 } else { 1183 zarray = yarray; 1184 } 1185 1186 idx = a->j; 1187 v = a->a; 1188 if (usecprow) { 1189 if (zz != yy) { 1190 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1191 } 1192 mbs = a->compressedrow.nrows; 1193 ii = a->compressedrow.i; 1194 ridx = a->compressedrow.rindex; 1195 } else { 1196 ii = a->i; 1197 y = yarray; 1198 z = zarray; 1199 } 1200 1201 for (i=0; i<mbs; i++) { 1202 n = ii[1] - ii[0]; ii++; 1203 if (usecprow) { 1204 z = zarray + 3*ridx[i]; 1205 y = yarray + 3*ridx[i]; 1206 } 1207 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1208 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1209 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1210 for (j=0; j<n; j++) { 1211 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1212 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1213 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1214 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1215 v += 9; 1216 } 1217 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1218 if (!usecprow) { 1219 z += 3; y += 3; 1220 } 1221 } 1222 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1223 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1224 if (zz != yy) { 1225 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1226 } 1227 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1228 PetscFunctionReturn(0); 1229 } 1230 1231 #undef __FUNCT__ 1232 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4" 1233 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1234 { 1235 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1236 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1237 MatScalar *v; 1238 PetscErrorCode ierr; 1239 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1240 PetscBool usecprow=a->compressedrow.use; 1241 1242 PetscFunctionBegin; 1243 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1244 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1245 if (zz != yy) { 1246 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1247 } else { 1248 zarray = yarray; 1249 } 1250 1251 idx = a->j; 1252 v = a->a; 1253 if (usecprow) { 1254 if (zz != yy) { 1255 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1256 } 1257 mbs = a->compressedrow.nrows; 1258 ii = a->compressedrow.i; 1259 ridx = a->compressedrow.rindex; 1260 } else { 1261 ii = a->i; 1262 y = yarray; 1263 z = zarray; 1264 } 1265 1266 for (i=0; i<mbs; i++) { 1267 n = ii[1] - ii[0]; ii++; 1268 if (usecprow) { 1269 z = zarray + 4*ridx[i]; 1270 y = yarray + 4*ridx[i]; 1271 } 1272 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1273 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1274 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1275 for (j=0; j<n; j++) { 1276 xb = x + 4*(*idx++); 1277 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1278 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1279 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1280 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1281 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1282 v += 16; 1283 } 1284 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1285 if (!usecprow) { 1286 z += 4; y += 4; 1287 } 1288 } 1289 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1290 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1291 if (zz != yy) { 1292 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1293 } 1294 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1295 PetscFunctionReturn(0); 1296 } 1297 1298 #undef __FUNCT__ 1299 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5" 1300 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1301 { 1302 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1303 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1304 PetscScalar *yarray,*zarray; 1305 MatScalar *v; 1306 PetscErrorCode ierr; 1307 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1308 PetscBool usecprow=a->compressedrow.use; 1309 1310 PetscFunctionBegin; 1311 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1312 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1313 if (zz != yy) { 1314 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1315 } else { 1316 zarray = yarray; 1317 } 1318 1319 idx = a->j; 1320 v = a->a; 1321 if (usecprow) { 1322 if (zz != yy) { 1323 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1324 } 1325 mbs = a->compressedrow.nrows; 1326 ii = a->compressedrow.i; 1327 ridx = a->compressedrow.rindex; 1328 } else { 1329 ii = a->i; 1330 y = yarray; 1331 z = zarray; 1332 } 1333 1334 for (i=0; i<mbs; i++) { 1335 n = ii[1] - ii[0]; ii++; 1336 if (usecprow) { 1337 z = zarray + 5*ridx[i]; 1338 y = yarray + 5*ridx[i]; 1339 } 1340 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1341 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1342 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1343 for (j=0; j<n; j++) { 1344 xb = x + 5*(*idx++); 1345 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1346 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1347 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1348 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1349 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1350 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1351 v += 25; 1352 } 1353 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1354 if (!usecprow) { 1355 z += 5; y += 5; 1356 } 1357 } 1358 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1359 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1360 if (zz != yy) { 1361 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1362 } 1363 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1364 PetscFunctionReturn(0); 1365 } 1366 #undef __FUNCT__ 1367 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6" 1368 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1369 { 1370 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1371 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6; 1372 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1373 MatScalar *v; 1374 PetscErrorCode ierr; 1375 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1376 PetscBool usecprow=a->compressedrow.use; 1377 1378 PetscFunctionBegin; 1379 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1380 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1381 if (zz != yy) { 1382 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1383 } else { 1384 zarray = yarray; 1385 } 1386 1387 idx = a->j; 1388 v = a->a; 1389 if (usecprow) { 1390 if (zz != yy) { 1391 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1392 } 1393 mbs = a->compressedrow.nrows; 1394 ii = a->compressedrow.i; 1395 ridx = a->compressedrow.rindex; 1396 } else { 1397 ii = a->i; 1398 y = yarray; 1399 z = zarray; 1400 } 1401 1402 for (i=0; i<mbs; i++) { 1403 n = ii[1] - ii[0]; ii++; 1404 if (usecprow) { 1405 z = zarray + 6*ridx[i]; 1406 y = yarray + 6*ridx[i]; 1407 } 1408 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1409 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1410 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1411 for (j=0; j<n; j++) { 1412 xb = x + 6*(*idx++); 1413 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1414 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1415 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1416 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1417 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1418 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1419 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1420 v += 36; 1421 } 1422 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1423 if (!usecprow) { 1424 z += 6; y += 6; 1425 } 1426 } 1427 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1428 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1429 if (zz != yy) { 1430 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1431 } 1432 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1433 PetscFunctionReturn(0); 1434 } 1435 1436 #undef __FUNCT__ 1437 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7" 1438 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1439 { 1440 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1441 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1442 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1443 MatScalar *v; 1444 PetscErrorCode ierr; 1445 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1446 PetscBool usecprow=a->compressedrow.use; 1447 1448 PetscFunctionBegin; 1449 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1450 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1451 if (zz != yy) { 1452 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1453 } else { 1454 zarray = yarray; 1455 } 1456 1457 idx = a->j; 1458 v = a->a; 1459 if (usecprow) { 1460 if (zz != yy) { 1461 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1462 } 1463 mbs = a->compressedrow.nrows; 1464 ii = a->compressedrow.i; 1465 ridx = a->compressedrow.rindex; 1466 } else { 1467 ii = a->i; 1468 y = yarray; 1469 z = zarray; 1470 } 1471 1472 for (i=0; i<mbs; i++) { 1473 n = ii[1] - ii[0]; ii++; 1474 if (usecprow) { 1475 z = zarray + 7*ridx[i]; 1476 y = yarray + 7*ridx[i]; 1477 } 1478 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1479 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1480 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1481 for (j=0; j<n; j++) { 1482 xb = x + 7*(*idx++); 1483 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1484 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1485 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1486 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1487 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1488 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1489 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1490 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1491 v += 49; 1492 } 1493 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1494 if (!usecprow) { 1495 z += 7; y += 7; 1496 } 1497 } 1498 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1499 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1500 if (zz != yy) { 1501 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1502 } 1503 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1504 PetscFunctionReturn(0); 1505 } 1506 1507 #undef __FUNCT__ 1508 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N" 1509 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1510 { 1511 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1512 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 1513 MatScalar *v; 1514 PetscErrorCode ierr; 1515 PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 1516 PetscInt ncols,k,*ridx=PETSC_NULL; 1517 PetscBool usecprow=a->compressedrow.use; 1518 1519 PetscFunctionBegin; 1520 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1521 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1522 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1523 1524 idx = a->j; 1525 v = a->a; 1526 if (usecprow) { 1527 mbs = a->compressedrow.nrows; 1528 ii = a->compressedrow.i; 1529 ridx = a->compressedrow.rindex; 1530 } else { 1531 mbs = a->mbs; 1532 ii = a->i; 1533 z = zarray; 1534 } 1535 1536 if (!a->mult_work) { 1537 k = PetscMax(A->rmap->n,A->cmap->n); 1538 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1539 } 1540 work = a->mult_work; 1541 for (i=0; i<mbs; i++) { 1542 n = ii[1] - ii[0]; ii++; 1543 ncols = n*bs; 1544 workt = work; 1545 for (j=0; j<n; j++) { 1546 xb = x + bs*(*idx++); 1547 for (k=0; k<bs; k++) workt[k] = xb[k]; 1548 workt += bs; 1549 } 1550 if (usecprow) z = zarray + bs*ridx[i]; 1551 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1552 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 1553 v += n*bs2; 1554 if (!usecprow) { 1555 z += bs; 1556 } 1557 } 1558 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1559 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1560 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1561 PetscFunctionReturn(0); 1562 } 1563 1564 #undef __FUNCT__ 1565 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ" 1566 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1567 { 1568 PetscScalar zero = 0.0; 1569 PetscErrorCode ierr; 1570 1571 PetscFunctionBegin; 1572 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1573 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1574 PetscFunctionReturn(0); 1575 } 1576 1577 #undef __FUNCT__ 1578 #define __FUNCT__ "MatMultTranspose_SeqBAIJ" 1579 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1580 { 1581 PetscScalar zero = 0.0; 1582 PetscErrorCode ierr; 1583 1584 PetscFunctionBegin; 1585 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1586 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1587 PetscFunctionReturn(0); 1588 } 1589 1590 #undef __FUNCT__ 1591 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ" 1592 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1593 { 1594 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1595 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1596 MatScalar *v; 1597 PetscErrorCode ierr; 1598 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL; 1599 Mat_CompressedRow cprow = a->compressedrow; 1600 PetscBool usecprow=cprow.use; 1601 1602 PetscFunctionBegin; 1603 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1604 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1605 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1606 1607 idx = a->j; 1608 v = a->a; 1609 if (usecprow) { 1610 mbs = cprow.nrows; 1611 ii = cprow.i; 1612 ridx = cprow.rindex; 1613 } else { 1614 mbs=a->mbs; 1615 ii = a->i; 1616 xb = x; 1617 } 1618 1619 switch (bs) { 1620 case 1: 1621 for (i=0; i<mbs; i++) { 1622 if (usecprow) xb = x + ridx[i]; 1623 x1 = xb[0]; 1624 ib = idx + ii[0]; 1625 n = ii[1] - ii[0]; ii++; 1626 for (j=0; j<n; j++) { 1627 rval = ib[j]; 1628 z[rval] += PetscConj(*v) * x1; 1629 v++; 1630 } 1631 if (!usecprow) xb++; 1632 } 1633 break; 1634 case 2: 1635 for (i=0; i<mbs; i++) { 1636 if (usecprow) xb = x + 2*ridx[i]; 1637 x1 = xb[0]; x2 = xb[1]; 1638 ib = idx + ii[0]; 1639 n = ii[1] - ii[0]; ii++; 1640 for (j=0; j<n; j++) { 1641 rval = ib[j]*2; 1642 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 1643 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 1644 v += 4; 1645 } 1646 if (!usecprow) xb += 2; 1647 } 1648 break; 1649 case 3: 1650 for (i=0; i<mbs; i++) { 1651 if (usecprow) xb = x + 3*ridx[i]; 1652 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1653 ib = idx + ii[0]; 1654 n = ii[1] - ii[0]; ii++; 1655 for (j=0; j<n; j++) { 1656 rval = ib[j]*3; 1657 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 1658 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 1659 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 1660 v += 9; 1661 } 1662 if (!usecprow) xb += 3; 1663 } 1664 break; 1665 case 4: 1666 for (i=0; i<mbs; i++) { 1667 if (usecprow) xb = x + 4*ridx[i]; 1668 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1669 ib = idx + ii[0]; 1670 n = ii[1] - ii[0]; ii++; 1671 for (j=0; j<n; j++) { 1672 rval = ib[j]*4; 1673 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 1674 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 1675 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 1676 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 1677 v += 16; 1678 } 1679 if (!usecprow) xb += 4; 1680 } 1681 break; 1682 case 5: 1683 for (i=0; i<mbs; i++) { 1684 if (usecprow) xb = x + 5*ridx[i]; 1685 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1686 x4 = xb[3]; x5 = xb[4]; 1687 ib = idx + ii[0]; 1688 n = ii[1] - ii[0]; ii++; 1689 for (j=0; j<n; j++) { 1690 rval = ib[j]*5; 1691 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 1692 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 1693 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 1694 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 1695 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 1696 v += 25; 1697 } 1698 if (!usecprow) xb += 5; 1699 } 1700 break; 1701 default: { /* block sizes larger than 5 by 5 are handled by BLAS */ 1702 PetscInt ncols,k; 1703 PetscScalar *work,*workt,*xtmp; 1704 1705 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 1706 if (!a->mult_work) { 1707 k = PetscMax(A->rmap->n,A->cmap->n); 1708 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1709 } 1710 work = a->mult_work; 1711 xtmp = x; 1712 for (i=0; i<mbs; i++) { 1713 n = ii[1] - ii[0]; ii++; 1714 ncols = n*bs; 1715 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1716 if (usecprow) { 1717 xtmp = x + bs*ridx[i]; 1718 } 1719 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1720 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1721 v += n*bs2; 1722 if (!usecprow) xtmp += bs; 1723 workt = work; 1724 for (j=0; j<n; j++) { 1725 zb = z + bs*(*idx++); 1726 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1727 workt += bs; 1728 } 1729 } 1730 } 1731 } 1732 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1733 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1734 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1735 PetscFunctionReturn(0); 1736 } 1737 1738 #undef __FUNCT__ 1739 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ" 1740 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1741 { 1742 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1743 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1744 MatScalar *v; 1745 PetscErrorCode ierr; 1746 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL; 1747 Mat_CompressedRow cprow = a->compressedrow; 1748 PetscBool usecprow=cprow.use; 1749 1750 PetscFunctionBegin; 1751 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1752 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1753 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1754 1755 idx = a->j; 1756 v = a->a; 1757 if (usecprow) { 1758 mbs = cprow.nrows; 1759 ii = cprow.i; 1760 ridx = cprow.rindex; 1761 } else { 1762 mbs=a->mbs; 1763 ii = a->i; 1764 xb = x; 1765 } 1766 1767 switch (bs) { 1768 case 1: 1769 for (i=0; i<mbs; i++) { 1770 if (usecprow) xb = x + ridx[i]; 1771 x1 = xb[0]; 1772 ib = idx + ii[0]; 1773 n = ii[1] - ii[0]; ii++; 1774 for (j=0; j<n; j++) { 1775 rval = ib[j]; 1776 z[rval] += *v * x1; 1777 v++; 1778 } 1779 if (!usecprow) xb++; 1780 } 1781 break; 1782 case 2: 1783 for (i=0; i<mbs; i++) { 1784 if (usecprow) xb = x + 2*ridx[i]; 1785 x1 = xb[0]; x2 = xb[1]; 1786 ib = idx + ii[0]; 1787 n = ii[1] - ii[0]; ii++; 1788 for (j=0; j<n; j++) { 1789 rval = ib[j]*2; 1790 z[rval++] += v[0]*x1 + v[1]*x2; 1791 z[rval++] += v[2]*x1 + v[3]*x2; 1792 v += 4; 1793 } 1794 if (!usecprow) xb += 2; 1795 } 1796 break; 1797 case 3: 1798 for (i=0; i<mbs; i++) { 1799 if (usecprow) xb = x + 3*ridx[i]; 1800 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1801 ib = idx + ii[0]; 1802 n = ii[1] - ii[0]; ii++; 1803 for (j=0; j<n; j++) { 1804 rval = ib[j]*3; 1805 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 1806 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 1807 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 1808 v += 9; 1809 } 1810 if (!usecprow) xb += 3; 1811 } 1812 break; 1813 case 4: 1814 for (i=0; i<mbs; i++) { 1815 if (usecprow) xb = x + 4*ridx[i]; 1816 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1817 ib = idx + ii[0]; 1818 n = ii[1] - ii[0]; ii++; 1819 for (j=0; j<n; j++) { 1820 rval = ib[j]*4; 1821 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1822 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1823 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1824 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1825 v += 16; 1826 } 1827 if (!usecprow) xb += 4; 1828 } 1829 break; 1830 case 5: 1831 for (i=0; i<mbs; i++) { 1832 if (usecprow) xb = x + 5*ridx[i]; 1833 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1834 x4 = xb[3]; x5 = xb[4]; 1835 ib = idx + ii[0]; 1836 n = ii[1] - ii[0]; ii++; 1837 for (j=0; j<n; j++) { 1838 rval = ib[j]*5; 1839 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1840 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1841 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1842 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1843 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1844 v += 25; 1845 } 1846 if (!usecprow) xb += 5; 1847 } 1848 break; 1849 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 1850 PetscInt ncols,k; 1851 PetscScalar *work,*workt,*xtmp; 1852 1853 if (!a->mult_work) { 1854 k = PetscMax(A->rmap->n,A->cmap->n); 1855 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1856 } 1857 work = a->mult_work; 1858 xtmp = x; 1859 for (i=0; i<mbs; i++) { 1860 n = ii[1] - ii[0]; ii++; 1861 ncols = n*bs; 1862 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1863 if (usecprow) { 1864 xtmp = x + bs*ridx[i]; 1865 } 1866 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1867 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1868 v += n*bs2; 1869 if (!usecprow) xtmp += bs; 1870 workt = work; 1871 for (j=0; j<n; j++) { 1872 zb = z + bs*(*idx++); 1873 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1874 workt += bs; 1875 } 1876 } 1877 } 1878 } 1879 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1880 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1881 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1882 PetscFunctionReturn(0); 1883 } 1884 1885 #undef __FUNCT__ 1886 #define __FUNCT__ "MatScale_SeqBAIJ" 1887 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 1888 { 1889 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 1890 PetscInt totalnz = a->bs2*a->nz; 1891 PetscScalar oalpha = alpha; 1892 PetscErrorCode ierr; 1893 PetscBLASInt one = 1,tnz; 1894 1895 PetscFunctionBegin; 1896 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 1897 BLASscal_(&tnz,&oalpha,a->a,&one); 1898 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1899 PetscFunctionReturn(0); 1900 } 1901 1902 #undef __FUNCT__ 1903 #define __FUNCT__ "MatNorm_SeqBAIJ" 1904 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 1905 { 1906 PetscErrorCode ierr; 1907 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1908 MatScalar *v = a->a; 1909 PetscReal sum = 0.0; 1910 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 1911 1912 PetscFunctionBegin; 1913 if (type == NORM_FROBENIUS) { 1914 for (i=0; i< bs2*nz; i++) { 1915 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 1916 } 1917 *norm = PetscSqrtReal(sum); 1918 } else if (type == NORM_1) { /* maximum column sum */ 1919 PetscReal *tmp; 1920 PetscInt *bcol = a->j; 1921 ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr); 1922 ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr); 1923 for (i=0; i<nz; i++) { 1924 for (j=0; j<bs; j++) { 1925 k1 = bs*(*bcol) + j; /* column index */ 1926 for (k=0; k<bs; k++) { 1927 tmp[k1] += PetscAbsScalar(*v); v++; 1928 } 1929 } 1930 bcol++; 1931 } 1932 *norm = 0.0; 1933 for (j=0; j<A->cmap->n; j++) { 1934 if (tmp[j] > *norm) *norm = tmp[j]; 1935 } 1936 ierr = PetscFree(tmp);CHKERRQ(ierr); 1937 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1938 *norm = 0.0; 1939 for (k=0; k<bs; k++) { 1940 for (j=0; j<a->mbs; j++) { 1941 v = a->a + bs2*a->i[j] + k; 1942 sum = 0.0; 1943 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 1944 for (k1=0; k1<bs; k1++) { 1945 sum += PetscAbsScalar(*v); 1946 v += bs; 1947 } 1948 } 1949 if (sum > *norm) *norm = sum; 1950 } 1951 } 1952 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1953 PetscFunctionReturn(0); 1954 } 1955 1956 1957 #undef __FUNCT__ 1958 #define __FUNCT__ "MatEqual_SeqBAIJ" 1959 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 1960 { 1961 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data; 1962 PetscErrorCode ierr; 1963 1964 PetscFunctionBegin; 1965 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1966 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1967 *flg = PETSC_FALSE; 1968 PetscFunctionReturn(0); 1969 } 1970 1971 /* if the a->i are the same */ 1972 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1973 if (!*flg) { 1974 PetscFunctionReturn(0); 1975 } 1976 1977 /* if a->j are the same */ 1978 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1979 if (!*flg) { 1980 PetscFunctionReturn(0); 1981 } 1982 /* if a->a are the same */ 1983 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1984 PetscFunctionReturn(0); 1985 1986 } 1987 1988 #undef __FUNCT__ 1989 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ" 1990 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 1991 { 1992 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1993 PetscErrorCode ierr; 1994 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1995 PetscScalar *x,zero = 0.0; 1996 MatScalar *aa,*aa_j; 1997 1998 PetscFunctionBegin; 1999 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2000 bs = A->rmap->bs; 2001 aa = a->a; 2002 ai = a->i; 2003 aj = a->j; 2004 ambs = a->mbs; 2005 bs2 = a->bs2; 2006 2007 ierr = VecSet(v,zero);CHKERRQ(ierr); 2008 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 2009 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 2010 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2011 for (i=0; i<ambs; i++) { 2012 for (j=ai[i]; j<ai[i+1]; j++) { 2013 if (aj[j] == i) { 2014 row = i*bs; 2015 aa_j = aa+j*bs2; 2016 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2017 break; 2018 } 2019 } 2020 } 2021 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2022 PetscFunctionReturn(0); 2023 } 2024 2025 #undef __FUNCT__ 2026 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ" 2027 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2028 { 2029 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2030 const PetscScalar *l,*r,*li,*ri; 2031 PetscScalar x; 2032 MatScalar *aa, *v; 2033 PetscErrorCode ierr; 2034 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2035 const PetscInt *ai,*aj; 2036 2037 PetscFunctionBegin; 2038 ai = a->i; 2039 aj = a->j; 2040 aa = a->a; 2041 m = A->rmap->n; 2042 n = A->cmap->n; 2043 bs = A->rmap->bs; 2044 mbs = a->mbs; 2045 bs2 = a->bs2; 2046 if (ll) { 2047 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2048 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2049 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2050 for (i=0; i<mbs; i++) { /* for each block row */ 2051 M = ai[i+1] - ai[i]; 2052 li = l + i*bs; 2053 v = aa + bs2*ai[i]; 2054 for (j=0; j<M; j++) { /* for each block */ 2055 for (k=0; k<bs2; k++) { 2056 (*v++) *= li[k%bs]; 2057 } 2058 } 2059 } 2060 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2061 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2062 } 2063 2064 if (rr) { 2065 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2066 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2067 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2068 for (i=0; i<mbs; i++) { /* for each block row */ 2069 iai = ai[i]; 2070 M = ai[i+1] - iai; 2071 v = aa + bs2*iai; 2072 for (j=0; j<M; j++) { /* for each block */ 2073 ri = r + bs*aj[iai+j]; 2074 for (k=0; k<bs; k++) { 2075 x = ri[k]; 2076 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2077 v += bs; 2078 } 2079 } 2080 } 2081 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2082 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2083 } 2084 PetscFunctionReturn(0); 2085 } 2086 2087 2088 #undef __FUNCT__ 2089 #define __FUNCT__ "MatGetInfo_SeqBAIJ" 2090 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2091 { 2092 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2093 2094 PetscFunctionBegin; 2095 info->block_size = a->bs2; 2096 info->nz_allocated = a->bs2*a->maxnz; 2097 info->nz_used = a->bs2*a->nz; 2098 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2099 info->assemblies = A->num_ass; 2100 info->mallocs = A->info.mallocs; 2101 info->memory = ((PetscObject)A)->mem; 2102 if (A->factortype) { 2103 info->fill_ratio_given = A->info.fill_ratio_given; 2104 info->fill_ratio_needed = A->info.fill_ratio_needed; 2105 info->factor_mallocs = A->info.factor_mallocs; 2106 } else { 2107 info->fill_ratio_given = 0; 2108 info->fill_ratio_needed = 0; 2109 info->factor_mallocs = 0; 2110 } 2111 PetscFunctionReturn(0); 2112 } 2113 2114 2115 #undef __FUNCT__ 2116 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2117 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2118 { 2119 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2120 PetscErrorCode ierr; 2121 2122 PetscFunctionBegin; 2123 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2124 PetscFunctionReturn(0); 2125 } 2126