1 #define PETSCMAT_DLL 2 3 #include "../src/mat/impls/baij/seq/baij.h" 4 #include "../src/mat/blockinvert.h" 5 #include "petscbt.h" 6 #include "petscblaslapack.h" 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ" 10 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 11 { 12 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 15 const PetscInt *idx; 16 PetscInt start,end,*ai,*aj,bs,*nidx2; 17 PetscBT table; 18 19 PetscFunctionBegin; 20 m = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 25 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 26 27 ierr = PetscBTCreate(m,table);CHKERRQ(ierr); 28 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 29 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 30 31 for (i=0; i<is_max; i++) { 32 /* Initialise the two local arrays */ 33 isz = 0; 34 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 35 36 /* Extract the indices, assume there can be duplicate entries */ 37 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 38 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 39 40 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 41 for (j=0; j<n ; ++j){ 42 ival = idx[j]/bs; /* convert the indices into block indices */ 43 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 44 if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;} 45 } 46 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 47 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 48 49 k = 0; 50 for (j=0; j<ov; j++){ /* for each overlap*/ 51 n = isz; 52 for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */ 53 row = nidx[k]; 54 start = ai[row]; 55 end = ai[row+1]; 56 for (l = start; l<end ; l++){ 57 val = aj[l]; 58 if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;} 59 } 60 } 61 } 62 /* expand the Index Set */ 63 for (j=0; j<isz; j++) { 64 for (k=0; k<bs; k++) 65 nidx2[j*bs+k] = nidx[j]*bs+k; 66 } 67 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 68 } 69 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 70 ierr = PetscFree(nidx);CHKERRQ(ierr); 71 ierr = PetscFree(nidx2);CHKERRQ(ierr); 72 PetscFunctionReturn(0); 73 } 74 75 #undef __FUNCT__ 76 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private" 77 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 78 { 79 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 80 PetscErrorCode ierr; 81 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 82 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 83 const PetscInt *irow,*icol; 84 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 85 PetscInt *aj = a->j,*ai = a->i; 86 MatScalar *mat_a; 87 Mat C; 88 PetscBool flag,sorted; 89 90 PetscFunctionBegin; 91 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 92 if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 93 94 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 95 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 96 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 97 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 98 99 ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 100 ssmap = smap; 101 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 102 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 103 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 104 /* determine lens of each row */ 105 for (i=0; i<nrows; i++) { 106 kstart = ai[irow[i]]; 107 kend = kstart + a->ilen[irow[i]]; 108 lens[i] = 0; 109 for (k=kstart; k<kend; k++) { 110 if (ssmap[aj[k]]) { 111 lens[i]++; 112 } 113 } 114 } 115 /* Create and fill new matrix */ 116 if (scall == MAT_REUSE_MATRIX) { 117 c = (Mat_SeqBAIJ *)((*B)->data); 118 119 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 120 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 121 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 122 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 123 C = *B; 124 } else { 125 ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 126 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 127 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 128 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr); 129 } 130 c = (Mat_SeqBAIJ *)(C->data); 131 for (i=0; i<nrows; i++) { 132 row = irow[i]; 133 kstart = ai[row]; 134 kend = kstart + a->ilen[row]; 135 mat_i = c->i[i]; 136 mat_j = c->j + mat_i; 137 mat_a = c->a + mat_i*bs2; 138 mat_ilen = c->ilen + i; 139 for (k=kstart; k<kend; k++) { 140 if ((tcol=ssmap[a->j[k]])) { 141 *mat_j++ = tcol - 1; 142 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 143 mat_a += bs2; 144 (*mat_ilen)++; 145 } 146 } 147 } 148 149 /* Free work space */ 150 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 151 ierr = PetscFree(smap);CHKERRQ(ierr); 152 ierr = PetscFree(lens);CHKERRQ(ierr); 153 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 154 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 155 156 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 157 *B = C; 158 PetscFunctionReturn(0); 159 } 160 161 #undef __FUNCT__ 162 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ" 163 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 164 { 165 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 166 IS is1,is2; 167 PetscErrorCode ierr; 168 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count; 169 const PetscInt *irow,*icol; 170 171 PetscFunctionBegin; 172 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 173 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 174 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 175 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 176 177 /* Verify if the indices corespond to each element in a block 178 and form the IS with compressed IS */ 179 ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr); 180 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 181 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 182 count = 0; 183 for (i=0; i<a->mbs; i++) { 184 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 185 if (vary[i]==bs) iary[count++] = i; 186 } 187 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 188 189 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 190 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 191 count = 0; 192 for (i=0; i<a->mbs; i++) { 193 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 194 if (vary[i]==bs) iary[count++] = i; 195 } 196 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 197 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 198 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 199 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 200 201 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 202 ISDestroy(is1); 203 ISDestroy(is2); 204 PetscFunctionReturn(0); 205 } 206 207 #undef __FUNCT__ 208 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ" 209 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 210 { 211 PetscErrorCode ierr; 212 PetscInt i; 213 214 PetscFunctionBegin; 215 if (scall == MAT_INITIAL_MATRIX) { 216 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 217 } 218 219 for (i=0; i<n; i++) { 220 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 221 } 222 PetscFunctionReturn(0); 223 } 224 225 226 /* -------------------------------------------------------*/ 227 /* Should check that shapes of vectors and matrices match */ 228 /* -------------------------------------------------------*/ 229 230 #undef __FUNCT__ 231 #define __FUNCT__ "MatMult_SeqBAIJ_1" 232 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 233 { 234 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 235 PetscScalar *z,sum; 236 const PetscScalar *x; 237 const MatScalar *v; 238 PetscErrorCode ierr; 239 PetscInt mbs,i,n,nonzerorow=0; 240 const PetscInt *idx,*ii,*ridx=PETSC_NULL; 241 PetscBool usecprow=a->compressedrow.use; 242 243 PetscFunctionBegin; 244 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 245 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 246 247 if (usecprow){ 248 mbs = a->compressedrow.nrows; 249 ii = a->compressedrow.i; 250 ridx = a->compressedrow.rindex; 251 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 252 } else { 253 mbs = a->mbs; 254 ii = a->i; 255 } 256 257 for (i=0; i<mbs; i++) { 258 n = ii[1] - ii[0]; 259 v = a->a + ii[0]; 260 idx = a->j + ii[0]; 261 ii++; 262 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 263 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 264 sum = 0.0; 265 PetscSparseDensePlusDot(sum,x,v,idx,n); 266 if (usecprow){ 267 z[ridx[i]] = sum; 268 } else { 269 nonzerorow += (n>0); 270 z[i] = sum; 271 } 272 } 273 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 274 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 275 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 276 PetscFunctionReturn(0); 277 } 278 279 #undef __FUNCT__ 280 #define __FUNCT__ "MatMult_SeqBAIJ_2" 281 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 282 { 283 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 284 PetscScalar *z = 0,sum1,sum2,*zarray; 285 const PetscScalar *x,*xb; 286 PetscScalar x1,x2; 287 const MatScalar *v; 288 PetscErrorCode ierr; 289 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 290 PetscBool usecprow=a->compressedrow.use; 291 292 PetscFunctionBegin; 293 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 294 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 295 296 idx = a->j; 297 v = a->a; 298 if (usecprow){ 299 mbs = a->compressedrow.nrows; 300 ii = a->compressedrow.i; 301 ridx = a->compressedrow.rindex; 302 } else { 303 mbs = a->mbs; 304 ii = a->i; 305 z = zarray; 306 } 307 308 for (i=0; i<mbs; i++) { 309 n = ii[1] - ii[0]; ii++; 310 sum1 = 0.0; sum2 = 0.0; 311 nonzerorow += (n>0); 312 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 313 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 314 for (j=0; j<n; j++) { 315 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 316 sum1 += v[0]*x1 + v[2]*x2; 317 sum2 += v[1]*x1 + v[3]*x2; 318 v += 4; 319 } 320 if (usecprow) z = zarray + 2*ridx[i]; 321 z[0] = sum1; z[1] = sum2; 322 if (!usecprow) z += 2; 323 } 324 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 325 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 326 ierr = PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);CHKERRQ(ierr); 327 PetscFunctionReturn(0); 328 } 329 330 #undef __FUNCT__ 331 #define __FUNCT__ "MatMult_SeqBAIJ_3" 332 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 333 { 334 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 335 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 336 const PetscScalar *x,*xb; 337 const MatScalar *v; 338 PetscErrorCode ierr; 339 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 340 PetscBool usecprow=a->compressedrow.use; 341 342 343 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 344 #pragma disjoint(*v,*z,*xb) 345 #endif 346 347 PetscFunctionBegin; 348 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 349 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 350 351 idx = a->j; 352 v = a->a; 353 if (usecprow){ 354 mbs = a->compressedrow.nrows; 355 ii = a->compressedrow.i; 356 ridx = a->compressedrow.rindex; 357 } else { 358 mbs = a->mbs; 359 ii = a->i; 360 z = zarray; 361 } 362 363 for (i=0; i<mbs; i++) { 364 n = ii[1] - ii[0]; ii++; 365 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 366 nonzerorow += (n>0); 367 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 368 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 369 for (j=0; j<n; j++) { 370 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 371 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 372 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 373 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 374 v += 9; 375 } 376 if (usecprow) z = zarray + 3*ridx[i]; 377 z[0] = sum1; z[1] = sum2; z[2] = sum3; 378 if (!usecprow) z += 3; 379 } 380 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 381 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 382 ierr = PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);CHKERRQ(ierr); 383 PetscFunctionReturn(0); 384 } 385 386 #undef __FUNCT__ 387 #define __FUNCT__ "MatMult_SeqBAIJ_4" 388 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 389 { 390 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 391 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 392 const PetscScalar *x,*xb; 393 const MatScalar *v; 394 PetscErrorCode ierr; 395 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 396 PetscBool usecprow=a->compressedrow.use; 397 398 PetscFunctionBegin; 399 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 400 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 401 402 idx = a->j; 403 v = a->a; 404 if (usecprow){ 405 mbs = a->compressedrow.nrows; 406 ii = a->compressedrow.i; 407 ridx = a->compressedrow.rindex; 408 } else { 409 mbs = a->mbs; 410 ii = a->i; 411 z = zarray; 412 } 413 414 for (i=0; i<mbs; i++) { 415 n = ii[1] - ii[0]; ii++; 416 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 417 nonzerorow += (n>0); 418 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 419 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 420 for (j=0; j<n; j++) { 421 xb = x + 4*(*idx++); 422 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 423 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 424 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 425 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 426 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 427 v += 16; 428 } 429 if (usecprow) z = zarray + 4*ridx[i]; 430 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 431 if (!usecprow) z += 4; 432 } 433 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 434 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 435 ierr = PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);CHKERRQ(ierr); 436 PetscFunctionReturn(0); 437 } 438 439 #undef __FUNCT__ 440 #define __FUNCT__ "MatMult_SeqBAIJ_5" 441 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 442 { 443 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 444 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 445 const PetscScalar *xb,*x; 446 const MatScalar *v; 447 PetscErrorCode ierr; 448 const PetscInt *idx,*ii,*ridx=PETSC_NULL; 449 PetscInt mbs,i,j,n,nonzerorow=0; 450 PetscBool usecprow=a->compressedrow.use; 451 452 PetscFunctionBegin; 453 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 454 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 455 456 idx = a->j; 457 v = a->a; 458 if (usecprow){ 459 mbs = a->compressedrow.nrows; 460 ii = a->compressedrow.i; 461 ridx = a->compressedrow.rindex; 462 } else { 463 mbs = a->mbs; 464 ii = a->i; 465 z = zarray; 466 } 467 468 for (i=0; i<mbs; i++) { 469 n = ii[1] - ii[0]; ii++; 470 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 471 nonzerorow += (n>0); 472 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 473 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 474 for (j=0; j<n; j++) { 475 xb = x + 5*(*idx++); 476 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 477 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 478 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 479 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 480 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 481 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 482 v += 25; 483 } 484 if (usecprow) z = zarray + 5*ridx[i]; 485 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 486 if (!usecprow) z += 5; 487 } 488 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 489 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 490 ierr = PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);CHKERRQ(ierr); 491 PetscFunctionReturn(0); 492 } 493 494 495 #undef __FUNCT__ 496 #define __FUNCT__ "MatMult_SeqBAIJ_6" 497 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 498 { 499 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 500 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 501 const PetscScalar *x,*xb; 502 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 503 const MatScalar *v; 504 PetscErrorCode ierr; 505 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 506 PetscBool usecprow=a->compressedrow.use; 507 508 PetscFunctionBegin; 509 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 510 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 511 512 idx = a->j; 513 v = a->a; 514 if (usecprow){ 515 mbs = a->compressedrow.nrows; 516 ii = a->compressedrow.i; 517 ridx = a->compressedrow.rindex; 518 } else { 519 mbs = a->mbs; 520 ii = a->i; 521 z = zarray; 522 } 523 524 for (i=0; i<mbs; i++) { 525 n = ii[1] - ii[0]; ii++; 526 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; 527 nonzerorow += (n>0); 528 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 529 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 530 for (j=0; j<n; j++) { 531 xb = x + 6*(*idx++); 532 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 533 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 534 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 535 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 536 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 537 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 538 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 539 v += 36; 540 } 541 if (usecprow) z = zarray + 6*ridx[i]; 542 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 543 if (!usecprow) z += 6; 544 } 545 546 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 547 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 548 ierr = PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);CHKERRQ(ierr); 549 PetscFunctionReturn(0); 550 } 551 552 #undef __FUNCT__ 553 #define __FUNCT__ "MatMult_SeqBAIJ_7" 554 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 555 { 556 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 557 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 558 const PetscScalar *x,*xb; 559 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 560 const MatScalar *v; 561 PetscErrorCode ierr; 562 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 563 PetscBool usecprow=a->compressedrow.use; 564 565 PetscFunctionBegin; 566 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 567 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 568 569 idx = a->j; 570 v = a->a; 571 if (usecprow){ 572 mbs = a->compressedrow.nrows; 573 ii = a->compressedrow.i; 574 ridx = a->compressedrow.rindex; 575 } else { 576 mbs = a->mbs; 577 ii = a->i; 578 z = zarray; 579 } 580 581 for (i=0; i<mbs; i++) { 582 n = ii[1] - ii[0]; ii++; 583 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 584 nonzerorow += (n>0); 585 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 586 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 587 for (j=0; j<n; j++) { 588 xb = x + 7*(*idx++); 589 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 590 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 591 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 592 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 593 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 594 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 595 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 596 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 597 v += 49; 598 } 599 if (usecprow) z = zarray + 7*ridx[i]; 600 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 601 if (!usecprow) z += 7; 602 } 603 604 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 605 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 606 ierr = PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);CHKERRQ(ierr); 607 PetscFunctionReturn(0); 608 } 609 610 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 611 /* Default MatMult for block size 15 */ 612 613 #undef __FUNCT__ 614 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1" 615 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 616 { 617 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 618 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 619 const PetscScalar *x,*xb; 620 PetscScalar *zarray,xv; 621 const MatScalar *v; 622 PetscErrorCode ierr; 623 const PetscInt *ii,*ij=a->j,*idx; 624 PetscInt mbs,i,j,k,n,*ridx=PETSC_NULL,nonzerorow=0; 625 PetscBool usecprow=a->compressedrow.use; 626 627 PetscFunctionBegin; 628 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 629 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 630 631 v = a->a; 632 if (usecprow){ 633 mbs = a->compressedrow.nrows; 634 ii = a->compressedrow.i; 635 ridx = a->compressedrow.rindex; 636 } else { 637 mbs = a->mbs; 638 ii = a->i; 639 z = zarray; 640 } 641 642 for (i=0; i<mbs; i++) { 643 n = ii[i+1] - ii[i]; 644 idx = ij + ii[i]; 645 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 646 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 647 648 nonzerorow += (n>0); 649 for (j=0; j<n; j++) { 650 xb = x + 15*(idx[j]); 651 652 for(k=0;k<15;k++){ 653 xv = xb[k]; 654 sum1 += v[0]*xv; 655 sum2 += v[1]*xv; 656 sum3 += v[2]*xv; 657 sum4 += v[3]*xv; 658 sum5 += v[4]*xv; 659 sum6 += v[5]*xv; 660 sum7 += v[6]*xv; 661 sum8 += v[7]*xv; 662 sum9 += v[8]*xv; 663 sum10 += v[9]*xv; 664 sum11 += v[10]*xv; 665 sum12 += v[11]*xv; 666 sum13 += v[12]*xv; 667 sum14 += v[13]*xv; 668 sum15 += v[14]*xv; 669 v += 15; 670 } 671 } 672 if (usecprow) z = zarray + 15*ridx[i]; 673 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 674 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 675 676 if (!usecprow) z += 15; 677 } 678 679 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 680 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 681 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 682 PetscFunctionReturn(0); 683 } 684 685 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 686 #undef __FUNCT__ 687 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2" 688 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 689 { 690 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 691 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 692 const PetscScalar *x,*xb; 693 PetscScalar x1,x2,x3,x4,*zarray; 694 const MatScalar *v; 695 PetscErrorCode ierr; 696 const PetscInt *ii,*ij=a->j,*idx; 697 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 698 PetscBool usecprow=a->compressedrow.use; 699 700 PetscFunctionBegin; 701 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 702 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 703 704 v = a->a; 705 if (usecprow){ 706 mbs = a->compressedrow.nrows; 707 ii = a->compressedrow.i; 708 ridx = a->compressedrow.rindex; 709 } else { 710 mbs = a->mbs; 711 ii = a->i; 712 z = zarray; 713 } 714 715 for (i=0; i<mbs; i++) { 716 n = ii[i+1] - ii[i]; 717 idx = ij + ii[i]; 718 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 719 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 720 721 nonzerorow += (n>0); 722 for (j=0; j<n; j++) { 723 xb = x + 15*(idx[j]); 724 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 725 726 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 727 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 728 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 729 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 730 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 731 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 732 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 733 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 734 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 735 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 736 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 737 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 738 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 739 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 740 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 741 742 v += 60; 743 744 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 745 746 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 747 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 748 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 749 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 750 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 751 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 752 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 753 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 754 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 755 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 756 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 757 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 758 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 759 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 760 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 761 v += 60; 762 763 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 764 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 765 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 766 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 767 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 768 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 769 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 770 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 771 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 772 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 773 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 774 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 775 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 776 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 777 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 778 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 779 v += 60; 780 781 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 782 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 783 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 784 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 785 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 786 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 787 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 788 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 789 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 790 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 791 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 792 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 793 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 794 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 795 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 796 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 797 v += 45; 798 } 799 if (usecprow) z = zarray + 15*ridx[i]; 800 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 801 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 802 803 if (!usecprow) z += 15; 804 } 805 806 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 807 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 808 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 809 PetscFunctionReturn(0); 810 } 811 812 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 813 #undef __FUNCT__ 814 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3" 815 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 816 { 817 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 818 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 819 const PetscScalar *x,*xb; 820 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 821 const MatScalar *v; 822 PetscErrorCode ierr; 823 const PetscInt *ii,*ij=a->j,*idx; 824 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 825 PetscBool usecprow=a->compressedrow.use; 826 827 PetscFunctionBegin; 828 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 829 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 830 831 v = a->a; 832 if (usecprow){ 833 mbs = a->compressedrow.nrows; 834 ii = a->compressedrow.i; 835 ridx = a->compressedrow.rindex; 836 } else { 837 mbs = a->mbs; 838 ii = a->i; 839 z = zarray; 840 } 841 842 for (i=0; i<mbs; i++) { 843 n = ii[i+1] - ii[i]; 844 idx = ij + ii[i]; 845 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 846 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 847 848 nonzerorow += (n>0); 849 for (j=0; j<n; j++) { 850 xb = x + 15*(idx[j]); 851 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 852 x8 = xb[7]; 853 854 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 855 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 856 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 857 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 858 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 859 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 860 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 861 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 862 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 863 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 864 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 865 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 866 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 867 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 868 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 869 v += 120; 870 871 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 872 873 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 874 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 875 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 876 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 877 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 878 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 879 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 880 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 881 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 882 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 883 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 884 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 885 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 886 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 887 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 888 v += 105; 889 } 890 if (usecprow) z = zarray + 15*ridx[i]; 891 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 892 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 893 894 if (!usecprow) z += 15; 895 } 896 897 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 898 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 899 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 900 PetscFunctionReturn(0); 901 } 902 903 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 904 905 #undef __FUNCT__ 906 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4" 907 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 908 { 909 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 910 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 911 const PetscScalar *x,*xb; 912 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 913 const MatScalar *v; 914 PetscErrorCode ierr; 915 const PetscInt *ii,*ij=a->j,*idx; 916 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 917 PetscBool usecprow=a->compressedrow.use; 918 919 PetscFunctionBegin; 920 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 921 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 922 923 v = a->a; 924 if (usecprow){ 925 mbs = a->compressedrow.nrows; 926 ii = a->compressedrow.i; 927 ridx = a->compressedrow.rindex; 928 } else { 929 mbs = a->mbs; 930 ii = a->i; 931 z = zarray; 932 } 933 934 for (i=0; i<mbs; i++) { 935 n = ii[i+1] - ii[i]; 936 idx = ij + ii[i]; 937 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 938 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 939 940 nonzerorow += (n>0); 941 for (j=0; j<n; j++) { 942 xb = x + 15*(idx[j]); 943 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 944 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 945 946 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 947 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 948 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 949 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 950 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 951 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 952 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 953 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 954 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 955 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 956 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 957 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 958 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 959 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 960 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 961 v += 225; 962 } 963 if (usecprow) z = zarray + 15*ridx[i]; 964 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 965 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 966 967 if (!usecprow) z += 15; 968 } 969 970 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 971 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 972 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 973 PetscFunctionReturn(0); 974 } 975 976 977 /* 978 This will not work with MatScalar == float because it calls the BLAS 979 */ 980 #undef __FUNCT__ 981 #define __FUNCT__ "MatMult_SeqBAIJ_N" 982 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 983 { 984 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 985 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 986 MatScalar *v; 987 PetscErrorCode ierr; 988 PetscInt mbs=a->mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 989 PetscInt ncols,k,*ridx=PETSC_NULL,nonzerorow=0; 990 PetscBool usecprow=a->compressedrow.use; 991 992 PetscFunctionBegin; 993 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 994 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 995 996 idx = a->j; 997 v = a->a; 998 if (usecprow){ 999 mbs = a->compressedrow.nrows; 1000 ii = a->compressedrow.i; 1001 ridx = a->compressedrow.rindex; 1002 } else { 1003 mbs = a->mbs; 1004 ii = a->i; 1005 z = zarray; 1006 } 1007 1008 if (!a->mult_work) { 1009 k = PetscMax(A->rmap->n,A->cmap->n); 1010 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1011 } 1012 work = a->mult_work; 1013 for (i=0; i<mbs; i++) { 1014 n = ii[1] - ii[0]; ii++; 1015 ncols = n*bs; 1016 workt = work; 1017 nonzerorow += (n>0); 1018 for (j=0; j<n; j++) { 1019 xb = x + bs*(*idx++); 1020 for (k=0; k<bs; k++) workt[k] = xb[k]; 1021 workt += bs; 1022 } 1023 if (usecprow) z = zarray + bs*ridx[i]; 1024 Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1025 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1026 v += n*bs2; 1027 if (!usecprow) z += bs; 1028 } 1029 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1030 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1031 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);CHKERRQ(ierr); 1032 PetscFunctionReturn(0); 1033 } 1034 1035 #undef __FUNCT__ 1036 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1" 1037 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1038 { 1039 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1040 const PetscScalar *x; 1041 PetscScalar *y,*z,sum; 1042 const MatScalar *v; 1043 PetscErrorCode ierr; 1044 PetscInt mbs=a->mbs,i,n,*ridx=PETSC_NULL,nonzerorow=0; 1045 const PetscInt *idx,*ii; 1046 PetscBool usecprow=a->compressedrow.use; 1047 1048 PetscFunctionBegin; 1049 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1050 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1051 if (zz != yy) { 1052 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1053 } else { 1054 z = y; 1055 } 1056 1057 idx = a->j; 1058 v = a->a; 1059 if (usecprow){ 1060 if (zz != yy){ 1061 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1062 } 1063 mbs = a->compressedrow.nrows; 1064 ii = a->compressedrow.i; 1065 ridx = a->compressedrow.rindex; 1066 } else { 1067 ii = a->i; 1068 } 1069 1070 for (i=0; i<mbs; i++) { 1071 n = ii[1] - ii[0]; 1072 ii++; 1073 if (!usecprow){ 1074 nonzerorow += (n>0); 1075 sum = y[i]; 1076 } else { 1077 sum = y[ridx[i]]; 1078 } 1079 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1080 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1081 PetscSparseDensePlusDot(sum,x,v,idx,n); 1082 v += n; 1083 idx += n; 1084 if (usecprow){ 1085 z[ridx[i]] = sum; 1086 } else { 1087 z[i] = sum; 1088 } 1089 } 1090 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1091 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1092 if (zz != yy) { 1093 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1094 } 1095 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 1096 PetscFunctionReturn(0); 1097 } 1098 1099 #undef __FUNCT__ 1100 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2" 1101 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1102 { 1103 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1104 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2; 1105 PetscScalar x1,x2,*yarray,*zarray; 1106 MatScalar *v; 1107 PetscErrorCode ierr; 1108 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1109 PetscBool usecprow=a->compressedrow.use; 1110 1111 PetscFunctionBegin; 1112 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1113 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1114 if (zz != yy) { 1115 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1116 } else { 1117 zarray = yarray; 1118 } 1119 1120 idx = a->j; 1121 v = a->a; 1122 if (usecprow){ 1123 if (zz != yy){ 1124 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1125 } 1126 mbs = a->compressedrow.nrows; 1127 ii = a->compressedrow.i; 1128 ridx = a->compressedrow.rindex; 1129 if (zz != yy){ 1130 ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1131 } 1132 } else { 1133 ii = a->i; 1134 y = yarray; 1135 z = zarray; 1136 } 1137 1138 for (i=0; i<mbs; i++) { 1139 n = ii[1] - ii[0]; ii++; 1140 if (usecprow){ 1141 z = zarray + 2*ridx[i]; 1142 y = yarray + 2*ridx[i]; 1143 } 1144 sum1 = y[0]; sum2 = y[1]; 1145 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1146 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1147 for (j=0; j<n; j++) { 1148 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 1149 sum1 += v[0]*x1 + v[2]*x2; 1150 sum2 += v[1]*x1 + v[3]*x2; 1151 v += 4; 1152 } 1153 z[0] = sum1; z[1] = sum2; 1154 if (!usecprow){ 1155 z += 2; y += 2; 1156 } 1157 } 1158 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1159 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1160 if (zz != yy) { 1161 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1162 } 1163 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1164 PetscFunctionReturn(0); 1165 } 1166 1167 #undef __FUNCT__ 1168 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3" 1169 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1170 { 1171 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1172 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1173 MatScalar *v; 1174 PetscErrorCode ierr; 1175 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1176 PetscBool usecprow=a->compressedrow.use; 1177 1178 PetscFunctionBegin; 1179 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1180 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1181 if (zz != yy) { 1182 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1183 } else { 1184 zarray = yarray; 1185 } 1186 1187 idx = a->j; 1188 v = a->a; 1189 if (usecprow){ 1190 if (zz != yy){ 1191 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1192 } 1193 mbs = a->compressedrow.nrows; 1194 ii = a->compressedrow.i; 1195 ridx = a->compressedrow.rindex; 1196 } else { 1197 ii = a->i; 1198 y = yarray; 1199 z = zarray; 1200 } 1201 1202 for (i=0; i<mbs; i++) { 1203 n = ii[1] - ii[0]; ii++; 1204 if (usecprow){ 1205 z = zarray + 3*ridx[i]; 1206 y = yarray + 3*ridx[i]; 1207 } 1208 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1209 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1210 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1211 for (j=0; j<n; j++) { 1212 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1213 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1214 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1215 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1216 v += 9; 1217 } 1218 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1219 if (!usecprow){ 1220 z += 3; y += 3; 1221 } 1222 } 1223 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1224 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1225 if (zz != yy) { 1226 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1227 } 1228 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1229 PetscFunctionReturn(0); 1230 } 1231 1232 #undef __FUNCT__ 1233 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4" 1234 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1235 { 1236 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1237 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1238 MatScalar *v; 1239 PetscErrorCode ierr; 1240 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1241 PetscBool usecprow=a->compressedrow.use; 1242 1243 PetscFunctionBegin; 1244 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1245 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1246 if (zz != yy) { 1247 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1248 } else { 1249 zarray = yarray; 1250 } 1251 1252 idx = a->j; 1253 v = a->a; 1254 if (usecprow){ 1255 if (zz != yy){ 1256 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1257 } 1258 mbs = a->compressedrow.nrows; 1259 ii = a->compressedrow.i; 1260 ridx = a->compressedrow.rindex; 1261 } else { 1262 ii = a->i; 1263 y = yarray; 1264 z = zarray; 1265 } 1266 1267 for (i=0; i<mbs; i++) { 1268 n = ii[1] - ii[0]; ii++; 1269 if (usecprow){ 1270 z = zarray + 4*ridx[i]; 1271 y = yarray + 4*ridx[i]; 1272 } 1273 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1274 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1275 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1276 for (j=0; j<n; j++) { 1277 xb = x + 4*(*idx++); 1278 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1279 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1280 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1281 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1282 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1283 v += 16; 1284 } 1285 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1286 if (!usecprow){ 1287 z += 4; y += 4; 1288 } 1289 } 1290 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1291 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1292 if (zz != yy) { 1293 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1294 } 1295 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1296 PetscFunctionReturn(0); 1297 } 1298 1299 #undef __FUNCT__ 1300 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5" 1301 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1302 { 1303 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1304 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1305 PetscScalar *yarray,*zarray; 1306 MatScalar *v; 1307 PetscErrorCode ierr; 1308 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1309 PetscBool usecprow=a->compressedrow.use; 1310 1311 PetscFunctionBegin; 1312 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1313 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1314 if (zz != yy) { 1315 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1316 } else { 1317 zarray = yarray; 1318 } 1319 1320 idx = a->j; 1321 v = a->a; 1322 if (usecprow){ 1323 if (zz != yy){ 1324 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1325 } 1326 mbs = a->compressedrow.nrows; 1327 ii = a->compressedrow.i; 1328 ridx = a->compressedrow.rindex; 1329 } else { 1330 ii = a->i; 1331 y = yarray; 1332 z = zarray; 1333 } 1334 1335 for (i=0; i<mbs; i++) { 1336 n = ii[1] - ii[0]; ii++; 1337 if (usecprow){ 1338 z = zarray + 5*ridx[i]; 1339 y = yarray + 5*ridx[i]; 1340 } 1341 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1342 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1343 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1344 for (j=0; j<n; j++) { 1345 xb = x + 5*(*idx++); 1346 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1347 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1348 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1349 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1350 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1351 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1352 v += 25; 1353 } 1354 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1355 if (!usecprow){ 1356 z += 5; y += 5; 1357 } 1358 } 1359 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1360 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1361 if (zz != yy) { 1362 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1363 } 1364 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1365 PetscFunctionReturn(0); 1366 } 1367 #undef __FUNCT__ 1368 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6" 1369 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1370 { 1371 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1372 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6; 1373 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1374 MatScalar *v; 1375 PetscErrorCode ierr; 1376 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1377 PetscBool usecprow=a->compressedrow.use; 1378 1379 PetscFunctionBegin; 1380 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1381 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1382 if (zz != yy) { 1383 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1384 } else { 1385 zarray = yarray; 1386 } 1387 1388 idx = a->j; 1389 v = a->a; 1390 if (usecprow){ 1391 if (zz != yy){ 1392 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1393 } 1394 mbs = a->compressedrow.nrows; 1395 ii = a->compressedrow.i; 1396 ridx = a->compressedrow.rindex; 1397 } else { 1398 ii = a->i; 1399 y = yarray; 1400 z = zarray; 1401 } 1402 1403 for (i=0; i<mbs; i++) { 1404 n = ii[1] - ii[0]; ii++; 1405 if (usecprow){ 1406 z = zarray + 6*ridx[i]; 1407 y = yarray + 6*ridx[i]; 1408 } 1409 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1410 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1411 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1412 for (j=0; j<n; j++) { 1413 xb = x + 6*(*idx++); 1414 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1415 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1416 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1417 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1418 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1419 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1420 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1421 v += 36; 1422 } 1423 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1424 if (!usecprow){ 1425 z += 6; y += 6; 1426 } 1427 } 1428 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1429 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1430 if (zz != yy) { 1431 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1432 } 1433 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1434 PetscFunctionReturn(0); 1435 } 1436 1437 #undef __FUNCT__ 1438 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7" 1439 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1440 { 1441 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1442 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1443 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1444 MatScalar *v; 1445 PetscErrorCode ierr; 1446 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1447 PetscBool usecprow=a->compressedrow.use; 1448 1449 PetscFunctionBegin; 1450 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1451 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1452 if (zz != yy) { 1453 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1454 } else { 1455 zarray = yarray; 1456 } 1457 1458 idx = a->j; 1459 v = a->a; 1460 if (usecprow){ 1461 if (zz != yy){ 1462 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1463 } 1464 mbs = a->compressedrow.nrows; 1465 ii = a->compressedrow.i; 1466 ridx = a->compressedrow.rindex; 1467 } else { 1468 ii = a->i; 1469 y = yarray; 1470 z = zarray; 1471 } 1472 1473 for (i=0; i<mbs; i++) { 1474 n = ii[1] - ii[0]; ii++; 1475 if (usecprow){ 1476 z = zarray + 7*ridx[i]; 1477 y = yarray + 7*ridx[i]; 1478 } 1479 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1480 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1481 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1482 for (j=0; j<n; j++) { 1483 xb = x + 7*(*idx++); 1484 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1485 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1486 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1487 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1488 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1489 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1490 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1491 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1492 v += 49; 1493 } 1494 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1495 if (!usecprow){ 1496 z += 7; y += 7; 1497 } 1498 } 1499 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1500 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1501 if (zz != yy) { 1502 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1503 } 1504 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1505 PetscFunctionReturn(0); 1506 } 1507 1508 #undef __FUNCT__ 1509 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N" 1510 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1511 { 1512 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1513 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 1514 MatScalar *v; 1515 PetscErrorCode ierr; 1516 PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 1517 PetscInt ncols,k,*ridx=PETSC_NULL; 1518 PetscBool usecprow=a->compressedrow.use; 1519 1520 PetscFunctionBegin; 1521 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1522 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1523 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1524 1525 idx = a->j; 1526 v = a->a; 1527 if (usecprow){ 1528 mbs = a->compressedrow.nrows; 1529 ii = a->compressedrow.i; 1530 ridx = a->compressedrow.rindex; 1531 } else { 1532 mbs = a->mbs; 1533 ii = a->i; 1534 z = zarray; 1535 } 1536 1537 if (!a->mult_work) { 1538 k = PetscMax(A->rmap->n,A->cmap->n); 1539 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1540 } 1541 work = a->mult_work; 1542 for (i=0; i<mbs; i++) { 1543 n = ii[1] - ii[0]; ii++; 1544 ncols = n*bs; 1545 workt = work; 1546 for (j=0; j<n; j++) { 1547 xb = x + bs*(*idx++); 1548 for (k=0; k<bs; k++) workt[k] = xb[k]; 1549 workt += bs; 1550 } 1551 if (usecprow) z = zarray + bs*ridx[i]; 1552 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1553 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 1554 v += n*bs2; 1555 if (!usecprow){ 1556 z += bs; 1557 } 1558 } 1559 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1560 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1561 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1562 PetscFunctionReturn(0); 1563 } 1564 1565 #undef __FUNCT__ 1566 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ" 1567 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1568 { 1569 PetscScalar zero = 0.0; 1570 PetscErrorCode ierr; 1571 1572 PetscFunctionBegin; 1573 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1574 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1575 PetscFunctionReturn(0); 1576 } 1577 1578 #undef __FUNCT__ 1579 #define __FUNCT__ "MatMultTranspose_SeqBAIJ" 1580 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1581 { 1582 PetscScalar zero = 0.0; 1583 PetscErrorCode ierr; 1584 1585 PetscFunctionBegin; 1586 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1587 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1588 PetscFunctionReturn(0); 1589 } 1590 1591 #undef __FUNCT__ 1592 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ" 1593 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1594 1595 { 1596 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1597 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1598 MatScalar *v; 1599 PetscErrorCode ierr; 1600 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL; 1601 Mat_CompressedRow cprow = a->compressedrow; 1602 PetscBool usecprow=cprow.use; 1603 1604 PetscFunctionBegin; 1605 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1606 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1607 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1608 1609 idx = a->j; 1610 v = a->a; 1611 if (usecprow){ 1612 mbs = cprow.nrows; 1613 ii = cprow.i; 1614 ridx = cprow.rindex; 1615 } else { 1616 mbs=a->mbs; 1617 ii = a->i; 1618 xb = x; 1619 } 1620 1621 switch (bs) { 1622 case 1: 1623 for (i=0; i<mbs; i++) { 1624 if (usecprow) xb = x + ridx[i]; 1625 x1 = xb[0]; 1626 ib = idx + ii[0]; 1627 n = ii[1] - ii[0]; ii++; 1628 for (j=0; j<n; j++) { 1629 rval = ib[j]; 1630 z[rval] += PetscConj(*v) * x1; 1631 v++; 1632 } 1633 if (!usecprow) xb++; 1634 } 1635 break; 1636 case 2: 1637 for (i=0; i<mbs; i++) { 1638 if (usecprow) xb = x + 2*ridx[i]; 1639 x1 = xb[0]; x2 = xb[1]; 1640 ib = idx + ii[0]; 1641 n = ii[1] - ii[0]; ii++; 1642 for (j=0; j<n; j++) { 1643 rval = ib[j]*2; 1644 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 1645 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 1646 v += 4; 1647 } 1648 if (!usecprow) xb += 2; 1649 } 1650 break; 1651 case 3: 1652 for (i=0; i<mbs; i++) { 1653 if (usecprow) xb = x + 3*ridx[i]; 1654 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1655 ib = idx + ii[0]; 1656 n = ii[1] - ii[0]; ii++; 1657 for (j=0; j<n; j++) { 1658 rval = ib[j]*3; 1659 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 1660 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 1661 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 1662 v += 9; 1663 } 1664 if (!usecprow) xb += 3; 1665 } 1666 break; 1667 case 4: 1668 for (i=0; i<mbs; i++) { 1669 if (usecprow) xb = x + 4*ridx[i]; 1670 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1671 ib = idx + ii[0]; 1672 n = ii[1] - ii[0]; ii++; 1673 for (j=0; j<n; j++) { 1674 rval = ib[j]*4; 1675 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 1676 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 1677 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 1678 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 1679 v += 16; 1680 } 1681 if (!usecprow) xb += 4; 1682 } 1683 break; 1684 case 5: 1685 for (i=0; i<mbs; i++) { 1686 if (usecprow) xb = x + 5*ridx[i]; 1687 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1688 x4 = xb[3]; x5 = xb[4]; 1689 ib = idx + ii[0]; 1690 n = ii[1] - ii[0]; ii++; 1691 for (j=0; j<n; j++) { 1692 rval = ib[j]*5; 1693 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 1694 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 1695 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 1696 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 1697 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 1698 v += 25; 1699 } 1700 if (!usecprow) xb += 5; 1701 } 1702 break; 1703 default: { /* block sizes larger than 5 by 5 are handled by BLAS */ 1704 PetscInt ncols,k; 1705 PetscScalar *work,*workt,*xtmp; 1706 1707 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 1708 if (!a->mult_work) { 1709 k = PetscMax(A->rmap->n,A->cmap->n); 1710 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1711 } 1712 work = a->mult_work; 1713 xtmp = x; 1714 for (i=0; i<mbs; i++) { 1715 n = ii[1] - ii[0]; ii++; 1716 ncols = n*bs; 1717 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1718 if (usecprow) { 1719 xtmp = x + bs*ridx[i]; 1720 } 1721 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1722 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1723 v += n*bs2; 1724 if (!usecprow) xtmp += bs; 1725 workt = work; 1726 for (j=0; j<n; j++) { 1727 zb = z + bs*(*idx++); 1728 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1729 workt += bs; 1730 } 1731 } 1732 } 1733 } 1734 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1735 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1736 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1737 PetscFunctionReturn(0); 1738 } 1739 1740 #undef __FUNCT__ 1741 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ" 1742 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1743 { 1744 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1745 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1746 MatScalar *v; 1747 PetscErrorCode ierr; 1748 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL; 1749 Mat_CompressedRow cprow = a->compressedrow; 1750 PetscBool usecprow=cprow.use; 1751 1752 PetscFunctionBegin; 1753 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1754 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1755 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1756 1757 idx = a->j; 1758 v = a->a; 1759 if (usecprow){ 1760 mbs = cprow.nrows; 1761 ii = cprow.i; 1762 ridx = cprow.rindex; 1763 } else { 1764 mbs=a->mbs; 1765 ii = a->i; 1766 xb = x; 1767 } 1768 1769 switch (bs) { 1770 case 1: 1771 for (i=0; i<mbs; i++) { 1772 if (usecprow) xb = x + ridx[i]; 1773 x1 = xb[0]; 1774 ib = idx + ii[0]; 1775 n = ii[1] - ii[0]; ii++; 1776 for (j=0; j<n; j++) { 1777 rval = ib[j]; 1778 z[rval] += *v * x1; 1779 v++; 1780 } 1781 if (!usecprow) xb++; 1782 } 1783 break; 1784 case 2: 1785 for (i=0; i<mbs; i++) { 1786 if (usecprow) xb = x + 2*ridx[i]; 1787 x1 = xb[0]; x2 = xb[1]; 1788 ib = idx + ii[0]; 1789 n = ii[1] - ii[0]; ii++; 1790 for (j=0; j<n; j++) { 1791 rval = ib[j]*2; 1792 z[rval++] += v[0]*x1 + v[1]*x2; 1793 z[rval++] += v[2]*x1 + v[3]*x2; 1794 v += 4; 1795 } 1796 if (!usecprow) xb += 2; 1797 } 1798 break; 1799 case 3: 1800 for (i=0; i<mbs; i++) { 1801 if (usecprow) xb = x + 3*ridx[i]; 1802 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1803 ib = idx + ii[0]; 1804 n = ii[1] - ii[0]; ii++; 1805 for (j=0; j<n; j++) { 1806 rval = ib[j]*3; 1807 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 1808 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 1809 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 1810 v += 9; 1811 } 1812 if (!usecprow) xb += 3; 1813 } 1814 break; 1815 case 4: 1816 for (i=0; i<mbs; i++) { 1817 if (usecprow) xb = x + 4*ridx[i]; 1818 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1819 ib = idx + ii[0]; 1820 n = ii[1] - ii[0]; ii++; 1821 for (j=0; j<n; j++) { 1822 rval = ib[j]*4; 1823 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1824 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1825 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1826 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1827 v += 16; 1828 } 1829 if (!usecprow) xb += 4; 1830 } 1831 break; 1832 case 5: 1833 for (i=0; i<mbs; i++) { 1834 if (usecprow) xb = x + 5*ridx[i]; 1835 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1836 x4 = xb[3]; x5 = xb[4]; 1837 ib = idx + ii[0]; 1838 n = ii[1] - ii[0]; ii++; 1839 for (j=0; j<n; j++) { 1840 rval = ib[j]*5; 1841 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1842 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1843 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1844 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1845 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1846 v += 25; 1847 } 1848 if (!usecprow) xb += 5; 1849 } 1850 break; 1851 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 1852 PetscInt ncols,k; 1853 PetscScalar *work,*workt,*xtmp; 1854 1855 if (!a->mult_work) { 1856 k = PetscMax(A->rmap->n,A->cmap->n); 1857 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1858 } 1859 work = a->mult_work; 1860 xtmp = x; 1861 for (i=0; i<mbs; i++) { 1862 n = ii[1] - ii[0]; ii++; 1863 ncols = n*bs; 1864 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1865 if (usecprow) { 1866 xtmp = x + bs*ridx[i]; 1867 } 1868 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1869 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1870 v += n*bs2; 1871 if (!usecprow) xtmp += bs; 1872 workt = work; 1873 for (j=0; j<n; j++) { 1874 zb = z + bs*(*idx++); 1875 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1876 workt += bs; 1877 } 1878 } 1879 } 1880 } 1881 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1882 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1883 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1884 PetscFunctionReturn(0); 1885 } 1886 1887 #undef __FUNCT__ 1888 #define __FUNCT__ "MatScale_SeqBAIJ" 1889 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 1890 { 1891 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 1892 PetscInt totalnz = a->bs2*a->nz; 1893 PetscScalar oalpha = alpha; 1894 PetscErrorCode ierr; 1895 PetscBLASInt one = 1,tnz = PetscBLASIntCast(totalnz); 1896 1897 PetscFunctionBegin; 1898 BLASscal_(&tnz,&oalpha,a->a,&one); 1899 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1900 PetscFunctionReturn(0); 1901 } 1902 1903 #undef __FUNCT__ 1904 #define __FUNCT__ "MatNorm_SeqBAIJ" 1905 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 1906 { 1907 PetscErrorCode ierr; 1908 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1909 MatScalar *v = a->a; 1910 PetscReal sum = 0.0; 1911 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 1912 1913 PetscFunctionBegin; 1914 if (type == NORM_FROBENIUS) { 1915 for (i=0; i< bs2*nz; i++) { 1916 #if defined(PETSC_USE_COMPLEX) 1917 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 1918 #else 1919 sum += (*v)*(*v); v++; 1920 #endif 1921 } 1922 *norm = sqrt(sum); 1923 } else if (type == NORM_1) { /* maximum column sum */ 1924 PetscReal *tmp; 1925 PetscInt *bcol = a->j; 1926 ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr); 1927 ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr); 1928 for (i=0; i<nz; i++){ 1929 for (j=0; j<bs; j++){ 1930 k1 = bs*(*bcol) + j; /* column index */ 1931 for (k=0; k<bs; k++){ 1932 tmp[k1] += PetscAbsScalar(*v); v++; 1933 } 1934 } 1935 bcol++; 1936 } 1937 *norm = 0.0; 1938 for (j=0; j<A->cmap->n; j++) { 1939 if (tmp[j] > *norm) *norm = tmp[j]; 1940 } 1941 ierr = PetscFree(tmp);CHKERRQ(ierr); 1942 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1943 *norm = 0.0; 1944 for (k=0; k<bs; k++) { 1945 for (j=0; j<a->mbs; j++) { 1946 v = a->a + bs2*a->i[j] + k; 1947 sum = 0.0; 1948 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 1949 for (k1=0; k1<bs; k1++){ 1950 sum += PetscAbsScalar(*v); 1951 v += bs; 1952 } 1953 } 1954 if (sum > *norm) *norm = sum; 1955 } 1956 } 1957 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1958 PetscFunctionReturn(0); 1959 } 1960 1961 1962 #undef __FUNCT__ 1963 #define __FUNCT__ "MatEqual_SeqBAIJ" 1964 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 1965 { 1966 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data; 1967 PetscErrorCode ierr; 1968 1969 PetscFunctionBegin; 1970 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1971 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1972 *flg = PETSC_FALSE; 1973 PetscFunctionReturn(0); 1974 } 1975 1976 /* if the a->i are the same */ 1977 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1978 if (!*flg) { 1979 PetscFunctionReturn(0); 1980 } 1981 1982 /* if a->j are the same */ 1983 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1984 if (!*flg) { 1985 PetscFunctionReturn(0); 1986 } 1987 /* if a->a are the same */ 1988 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1989 PetscFunctionReturn(0); 1990 1991 } 1992 1993 #undef __FUNCT__ 1994 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ" 1995 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 1996 { 1997 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1998 PetscErrorCode ierr; 1999 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 2000 PetscScalar *x,zero = 0.0; 2001 MatScalar *aa,*aa_j; 2002 2003 PetscFunctionBegin; 2004 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2005 bs = A->rmap->bs; 2006 aa = a->a; 2007 ai = a->i; 2008 aj = a->j; 2009 ambs = a->mbs; 2010 bs2 = a->bs2; 2011 2012 ierr = VecSet(v,zero);CHKERRQ(ierr); 2013 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 2014 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 2015 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2016 for (i=0; i<ambs; i++) { 2017 for (j=ai[i]; j<ai[i+1]; j++) { 2018 if (aj[j] == i) { 2019 row = i*bs; 2020 aa_j = aa+j*bs2; 2021 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2022 break; 2023 } 2024 } 2025 } 2026 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2027 PetscFunctionReturn(0); 2028 } 2029 2030 #undef __FUNCT__ 2031 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ" 2032 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2033 { 2034 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2035 const PetscScalar *l,*r,*li,*ri; 2036 PetscScalar x; 2037 MatScalar *aa, *v; 2038 PetscErrorCode ierr; 2039 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2040 const PetscInt *ai,*aj; 2041 2042 PetscFunctionBegin; 2043 ai = a->i; 2044 aj = a->j; 2045 aa = a->a; 2046 m = A->rmap->n; 2047 n = A->cmap->n; 2048 bs = A->rmap->bs; 2049 mbs = a->mbs; 2050 bs2 = a->bs2; 2051 if (ll) { 2052 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2053 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2054 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2055 for (i=0; i<mbs; i++) { /* for each block row */ 2056 M = ai[i+1] - ai[i]; 2057 li = l + i*bs; 2058 v = aa + bs2*ai[i]; 2059 for (j=0; j<M; j++) { /* for each block */ 2060 for (k=0; k<bs2; k++) { 2061 (*v++) *= li[k%bs]; 2062 } 2063 } 2064 } 2065 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2066 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2067 } 2068 2069 if (rr) { 2070 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2071 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2072 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2073 for (i=0; i<mbs; i++) { /* for each block row */ 2074 iai = ai[i]; 2075 M = ai[i+1] - iai; 2076 v = aa + bs2*iai; 2077 for (j=0; j<M; j++) { /* for each block */ 2078 ri = r + bs*aj[iai+j]; 2079 for (k=0; k<bs; k++) { 2080 x = ri[k]; 2081 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2082 v += bs; 2083 } 2084 } 2085 } 2086 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2087 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2088 } 2089 PetscFunctionReturn(0); 2090 } 2091 2092 2093 #undef __FUNCT__ 2094 #define __FUNCT__ "MatGetInfo_SeqBAIJ" 2095 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2096 { 2097 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2098 2099 PetscFunctionBegin; 2100 info->block_size = a->bs2; 2101 info->nz_allocated = a->bs2*a->maxnz; 2102 info->nz_used = a->bs2*a->nz; 2103 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2104 info->assemblies = A->num_ass; 2105 info->mallocs = A->info.mallocs; 2106 info->memory = ((PetscObject)A)->mem; 2107 if (A->factortype) { 2108 info->fill_ratio_given = A->info.fill_ratio_given; 2109 info->fill_ratio_needed = A->info.fill_ratio_needed; 2110 info->factor_mallocs = A->info.factor_mallocs; 2111 } else { 2112 info->fill_ratio_given = 0; 2113 info->fill_ratio_needed = 0; 2114 info->factor_mallocs = 0; 2115 } 2116 PetscFunctionReturn(0); 2117 } 2118 2119 2120 #undef __FUNCT__ 2121 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2122 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2123 { 2124 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2125 PetscErrorCode ierr; 2126 2127 PetscFunctionBegin; 2128 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2129 PetscFunctionReturn(0); 2130 } 2131