1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <../src/mat/impls/sbaij/seq/sbaij.h> 6 #include <petscblaslapack.h> 7 8 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 9 { 10 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 11 PetscErrorCode ierr; 12 PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 13 const PetscInt *idx; 14 PetscBT table_out,table_in; 15 16 PetscFunctionBegin; 17 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 18 mbs = a->mbs; 19 ai = a->i; 20 aj = a->j; 21 bs = A->rmap->bs; 22 ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr); 23 ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr); 24 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 25 ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr); 26 27 for (i=0; i<is_max; i++) { /* for each is */ 28 isz = 0; 29 ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr); 30 31 /* Extract the indices, assume there can be duplicate entries */ 32 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 33 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 34 35 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 36 bcol_max = 0; 37 for (j=0; j<n; ++j) { 38 brow = idx[j]/bs; /* convert the indices into block indices */ 39 if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 40 if (!PetscBTLookupSet(table_out,brow)) { 41 nidx[isz++] = brow; 42 if (bcol_max < brow) bcol_max = brow; 43 } 44 } 45 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 46 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 47 48 k = 0; 49 for (j=0; j<ov; j++) { /* for each overlap */ 50 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 51 ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr); 52 for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); } 53 54 n = isz; /* length of the updated is[i] */ 55 for (brow=0; brow<mbs; brow++) { 56 start = ai[brow]; end = ai[brow+1]; 57 if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 58 for (l = start; l<end; l++) { 59 bcol = aj[l]; 60 if (!PetscBTLookupSet(table_out,bcol)) { 61 nidx[isz++] = bcol; 62 if (bcol_max < bcol) bcol_max = bcol; 63 } 64 } 65 k++; 66 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 67 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 68 for (l = start; l<end; l++) { 69 bcol = aj[l]; 70 if (bcol > bcol_max) break; 71 if (PetscBTLookup(table_in,bcol)) { 72 if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow; 73 break; /* for l = start; l<end ; l++) */ 74 } 75 } 76 } 77 } 78 } /* for each overlap */ 79 80 /* expand the Index Set */ 81 for (j=0; j<isz; j++) { 82 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 83 } 84 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 85 } 86 ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr); 87 ierr = PetscFree(nidx);CHKERRQ(ierr); 88 ierr = PetscFree(nidx2);CHKERRQ(ierr); 89 ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr); 90 PetscFunctionReturn(0); 91 } 92 93 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 94 Zero some ops' to avoid invalid usse */ 95 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 96 { 97 PetscErrorCode ierr; 98 99 PetscFunctionBegin; 100 ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr); 101 Bseq->ops->mult = 0; 102 Bseq->ops->multadd = 0; 103 Bseq->ops->multtranspose = 0; 104 Bseq->ops->multtransposeadd = 0; 105 Bseq->ops->lufactor = 0; 106 Bseq->ops->choleskyfactor = 0; 107 Bseq->ops->lufactorsymbolic = 0; 108 Bseq->ops->choleskyfactorsymbolic = 0; 109 Bseq->ops->getinertia = 0; 110 PetscFunctionReturn(0); 111 } 112 113 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 114 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 115 { 116 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 117 PetscErrorCode ierr; 118 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 119 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 120 const PetscInt *irow,*icol; 121 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 122 PetscInt *aj = a->j,*ai = a->i; 123 MatScalar *mat_a; 124 Mat C; 125 PetscBool flag; 126 127 PetscFunctionBegin; 128 129 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 130 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 131 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 132 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 133 134 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 135 ssmap = smap; 136 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 137 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 138 /* determine lens of each row */ 139 for (i=0; i<nrows; i++) { 140 kstart = ai[irow[i]]; 141 kend = kstart + a->ilen[irow[i]]; 142 lens[i] = 0; 143 for (k=kstart; k<kend; k++) { 144 if (ssmap[aj[k]]) lens[i]++; 145 } 146 } 147 /* Create and fill new matrix */ 148 if (scall == MAT_REUSE_MATRIX) { 149 c = (Mat_SeqSBAIJ*)((*B)->data); 150 151 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 152 ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr); 153 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 154 ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr); 155 C = *B; 156 } else { 157 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 158 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 159 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 160 ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 161 } 162 c = (Mat_SeqSBAIJ*)(C->data); 163 for (i=0; i<nrows; i++) { 164 row = irow[i]; 165 kstart = ai[row]; 166 kend = kstart + a->ilen[row]; 167 mat_i = c->i[i]; 168 mat_j = c->j + mat_i; 169 mat_a = c->a + mat_i*bs2; 170 mat_ilen = c->ilen + i; 171 for (k=kstart; k<kend; k++) { 172 if ((tcol=ssmap[a->j[k]])) { 173 *mat_j++ = tcol - 1; 174 ierr = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr); 175 mat_a += bs2; 176 (*mat_ilen)++; 177 } 178 } 179 } 180 /* sort */ 181 { 182 MatScalar *work; 183 184 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 185 for (i=0; i<nrows; i++) { 186 PetscInt ilen; 187 mat_i = c->i[i]; 188 mat_j = c->j + mat_i; 189 mat_a = c->a + mat_i*bs2; 190 ilen = c->ilen[i]; 191 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 192 } 193 ierr = PetscFree(work);CHKERRQ(ierr); 194 } 195 196 /* Free work space */ 197 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 198 ierr = PetscFree(smap);CHKERRQ(ierr); 199 ierr = PetscFree(lens);CHKERRQ(ierr); 200 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 201 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 202 203 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 204 *B = C; 205 PetscFunctionReturn(0); 206 } 207 208 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 209 { 210 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 211 IS is1,is2; 212 PetscErrorCode ierr; 213 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs; 214 const PetscInt *irow,*icol; 215 216 PetscFunctionBegin; 217 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 218 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 219 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 220 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 221 222 /* Verify if the indices corespond to each element in a block 223 and form the IS with compressed IS */ 224 maxmnbs = PetscMax(a->mbs,a->nbs); 225 ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr); 226 ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr); 227 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 228 for (i=0; i<a->mbs; i++) { 229 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 230 } 231 count = 0; 232 for (i=0; i<nrows; i++) { 233 PetscInt j = irow[i] / bs; 234 if ((vary[j]--)==bs) iary[count++] = j; 235 } 236 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 237 238 ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr); 239 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 240 for (i=0; i<a->nbs; i++) { 241 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 242 } 243 count = 0; 244 for (i=0; i<ncols; i++) { 245 PetscInt j = icol[i] / bs; 246 if ((vary[j]--)==bs) iary[count++] = j; 247 } 248 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 249 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 250 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 251 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 252 253 ierr = MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 254 ierr = ISDestroy(&is1);CHKERRQ(ierr); 255 ierr = ISDestroy(&is2);CHKERRQ(ierr); 256 257 if (isrow != iscol) { 258 PetscBool isequal; 259 ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr); 260 if (!isequal) { 261 ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr); 262 } 263 } 264 PetscFunctionReturn(0); 265 } 266 267 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 268 { 269 PetscErrorCode ierr; 270 PetscInt i; 271 272 PetscFunctionBegin; 273 if (scall == MAT_INITIAL_MATRIX) { 274 ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr); 275 } 276 277 for (i=0; i<n; i++) { 278 ierr = MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 279 } 280 PetscFunctionReturn(0); 281 } 282 283 /* -------------------------------------------------------*/ 284 /* Should check that shapes of vectors and matrices match */ 285 /* -------------------------------------------------------*/ 286 287 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 288 { 289 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 290 PetscScalar *z,x1,x2,zero=0.0; 291 const PetscScalar *x,*xb; 292 const MatScalar *v; 293 PetscErrorCode ierr; 294 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 295 const PetscInt *aj=a->j,*ai=a->i,*ib; 296 PetscInt nonzerorow=0; 297 298 PetscFunctionBegin; 299 ierr = VecSet(zz,zero);CHKERRQ(ierr); 300 if (!a->nz) PetscFunctionReturn(0); 301 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 302 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 303 304 v = a->a; 305 xb = x; 306 307 for (i=0; i<mbs; i++) { 308 n = ai[1] - ai[0]; /* length of i_th block row of A */ 309 x1 = xb[0]; x2 = xb[1]; 310 ib = aj + *ai; 311 jmin = 0; 312 nonzerorow += (n>0); 313 if (*ib == i) { /* (diag of A)*x */ 314 z[2*i] += v[0]*x1 + v[2]*x2; 315 z[2*i+1] += v[2]*x1 + v[3]*x2; 316 v += 4; jmin++; 317 } 318 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 319 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 320 for (j=jmin; j<n; j++) { 321 /* (strict lower triangular part of A)*x */ 322 cval = ib[j]*2; 323 z[cval] += v[0]*x1 + v[1]*x2; 324 z[cval+1] += v[2]*x1 + v[3]*x2; 325 /* (strict upper triangular part of A)*x */ 326 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 327 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 328 v += 4; 329 } 330 xb +=2; ai++; 331 } 332 333 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 334 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 335 ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 336 PetscFunctionReturn(0); 337 } 338 339 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 340 { 341 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 342 PetscScalar *z,x1,x2,x3,zero=0.0; 343 const PetscScalar *x,*xb; 344 const MatScalar *v; 345 PetscErrorCode ierr; 346 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 347 const PetscInt *aj = a->j,*ai = a->i,*ib; 348 PetscInt nonzerorow=0; 349 350 PetscFunctionBegin; 351 ierr = VecSet(zz,zero);CHKERRQ(ierr); 352 if (!a->nz) PetscFunctionReturn(0); 353 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 354 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 355 356 v = a->a; 357 xb = x; 358 359 for (i=0; i<mbs; i++) { 360 n = ai[1] - ai[0]; /* length of i_th block row of A */ 361 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 362 ib = aj + *ai; 363 jmin = 0; 364 nonzerorow += (n>0); 365 if (*ib == i) { /* (diag of A)*x */ 366 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 367 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 368 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 369 v += 9; jmin++; 370 } 371 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 372 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 373 for (j=jmin; j<n; j++) { 374 /* (strict lower triangular part of A)*x */ 375 cval = ib[j]*3; 376 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 377 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 378 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 379 /* (strict upper triangular part of A)*x */ 380 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 381 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 382 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 383 v += 9; 384 } 385 xb +=3; ai++; 386 } 387 388 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 389 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 390 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 391 PetscFunctionReturn(0); 392 } 393 394 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 395 { 396 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 397 PetscScalar *z,x1,x2,x3,x4,zero=0.0; 398 const PetscScalar *x,*xb; 399 const MatScalar *v; 400 PetscErrorCode ierr; 401 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 402 const PetscInt *aj = a->j,*ai = a->i,*ib; 403 PetscInt nonzerorow = 0; 404 405 PetscFunctionBegin; 406 ierr = VecSet(zz,zero);CHKERRQ(ierr); 407 if (!a->nz) PetscFunctionReturn(0); 408 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 409 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 410 411 v = a->a; 412 xb = x; 413 414 for (i=0; i<mbs; i++) { 415 n = ai[1] - ai[0]; /* length of i_th block row of A */ 416 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 417 ib = aj + *ai; 418 jmin = 0; 419 nonzerorow += (n>0); 420 if (*ib == i) { /* (diag of A)*x */ 421 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 422 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 423 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 424 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 425 v += 16; jmin++; 426 } 427 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 428 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 429 for (j=jmin; j<n; j++) { 430 /* (strict lower triangular part of A)*x */ 431 cval = ib[j]*4; 432 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 433 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 434 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 435 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 436 /* (strict upper triangular part of A)*x */ 437 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 438 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 439 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 440 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 441 v += 16; 442 } 443 xb +=4; ai++; 444 } 445 446 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 447 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 448 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 449 PetscFunctionReturn(0); 450 } 451 452 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 453 { 454 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 455 PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0; 456 const PetscScalar *x,*xb; 457 const MatScalar *v; 458 PetscErrorCode ierr; 459 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 460 const PetscInt *aj = a->j,*ai = a->i,*ib; 461 PetscInt nonzerorow=0; 462 463 PetscFunctionBegin; 464 ierr = VecSet(zz,zero);CHKERRQ(ierr); 465 if (!a->nz) PetscFunctionReturn(0); 466 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 467 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 468 469 v = a->a; 470 xb = x; 471 472 for (i=0; i<mbs; i++) { 473 n = ai[1] - ai[0]; /* length of i_th block row of A */ 474 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 475 ib = aj + *ai; 476 jmin = 0; 477 nonzerorow += (n>0); 478 if (*ib == i) { /* (diag of A)*x */ 479 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 480 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 481 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 482 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 483 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 484 v += 25; jmin++; 485 } 486 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 487 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 488 for (j=jmin; j<n; j++) { 489 /* (strict lower triangular part of A)*x */ 490 cval = ib[j]*5; 491 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 492 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 493 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 494 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 495 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 496 /* (strict upper triangular part of A)*x */ 497 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 498 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 499 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 500 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 501 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 502 v += 25; 503 } 504 xb +=5; ai++; 505 } 506 507 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 508 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 509 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 510 PetscFunctionReturn(0); 511 } 512 513 514 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 515 { 516 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 517 PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0; 518 const PetscScalar *x,*xb; 519 const MatScalar *v; 520 PetscErrorCode ierr; 521 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 522 const PetscInt *aj=a->j,*ai=a->i,*ib; 523 PetscInt nonzerorow=0; 524 525 PetscFunctionBegin; 526 ierr = VecSet(zz,zero);CHKERRQ(ierr); 527 if (!a->nz) PetscFunctionReturn(0); 528 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 529 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 530 531 v = a->a; 532 xb = x; 533 534 for (i=0; i<mbs; i++) { 535 n = ai[1] - ai[0]; /* length of i_th block row of A */ 536 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 537 ib = aj + *ai; 538 jmin = 0; 539 nonzerorow += (n>0); 540 if (*ib == i) { /* (diag of A)*x */ 541 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 542 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 543 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 544 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 545 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 546 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 547 v += 36; jmin++; 548 } 549 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 550 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 551 for (j=jmin; j<n; j++) { 552 /* (strict lower triangular part of A)*x */ 553 cval = ib[j]*6; 554 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 555 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 556 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 557 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 558 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 559 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 560 /* (strict upper triangular part of A)*x */ 561 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 562 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 563 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 564 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 565 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 566 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 567 v += 36; 568 } 569 xb +=6; ai++; 570 } 571 572 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 573 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 574 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 575 PetscFunctionReturn(0); 576 } 577 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 578 { 579 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 580 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 581 const PetscScalar *x,*xb; 582 const MatScalar *v; 583 PetscErrorCode ierr; 584 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 585 const PetscInt *aj=a->j,*ai=a->i,*ib; 586 PetscInt nonzerorow=0; 587 588 PetscFunctionBegin; 589 ierr = VecSet(zz,zero);CHKERRQ(ierr); 590 if (!a->nz) PetscFunctionReturn(0); 591 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 592 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 593 594 v = a->a; 595 xb = x; 596 597 for (i=0; i<mbs; i++) { 598 n = ai[1] - ai[0]; /* length of i_th block row of A */ 599 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 600 ib = aj + *ai; 601 jmin = 0; 602 nonzerorow += (n>0); 603 if (*ib == i) { /* (diag of A)*x */ 604 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 605 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 606 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 607 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 608 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 609 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 610 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 611 v += 49; jmin++; 612 } 613 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 614 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 615 for (j=jmin; j<n; j++) { 616 /* (strict lower triangular part of A)*x */ 617 cval = ib[j]*7; 618 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 619 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 620 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 621 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 622 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 623 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 624 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 625 /* (strict upper triangular part of A)*x */ 626 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 627 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 628 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 629 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 630 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 631 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 632 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 633 v += 49; 634 } 635 xb +=7; ai++; 636 } 637 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 638 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 639 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 640 PetscFunctionReturn(0); 641 } 642 643 /* 644 This will not work with MatScalar == float because it calls the BLAS 645 */ 646 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 647 { 648 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 649 PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0; 650 const PetscScalar *x,*x_ptr,*xb; 651 const MatScalar *v; 652 PetscErrorCode ierr; 653 PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 654 const PetscInt *idx,*aj,*ii; 655 PetscInt nonzerorow=0; 656 657 PetscFunctionBegin; 658 ierr = VecSet(zz,zero);CHKERRQ(ierr); 659 if (!a->nz) PetscFunctionReturn(0); 660 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x; 661 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 662 663 aj = a->j; 664 v = a->a; 665 ii = a->i; 666 667 if (!a->mult_work) { 668 ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr); 669 } 670 work = a->mult_work; 671 672 for (i=0; i<mbs; i++) { 673 n = ii[1] - ii[0]; ncols = n*bs; 674 workt = work; idx=aj+ii[0]; 675 nonzerorow += (n>0); 676 677 /* upper triangular part */ 678 for (j=0; j<n; j++) { 679 xb = x_ptr + bs*(*idx++); 680 for (k=0; k<bs; k++) workt[k] = xb[k]; 681 workt += bs; 682 } 683 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 684 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 685 686 /* strict lower triangular part */ 687 idx = aj+ii[0]; 688 if (*idx == i) { 689 ncols -= bs; v += bs2; idx++; n--; 690 } 691 692 if (ncols > 0) { 693 workt = work; 694 ierr = PetscArrayzero(workt,ncols);CHKERRQ(ierr); 695 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 696 for (j=0; j<n; j++) { 697 zb = z_ptr + bs*(*idx++); 698 for (k=0; k<bs; k++) zb[k] += workt[k]; 699 workt += bs; 700 } 701 } 702 x += bs; v += n*bs2; z += bs; ii++; 703 } 704 705 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 706 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 707 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 708 PetscFunctionReturn(0); 709 } 710 711 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 712 { 713 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 714 PetscScalar *z,x1; 715 const PetscScalar *x,*xb; 716 const MatScalar *v; 717 PetscErrorCode ierr; 718 PetscInt mbs =a->mbs,i,n,cval,j,jmin; 719 const PetscInt *aj=a->j,*ai=a->i,*ib; 720 PetscInt nonzerorow=0; 721 722 PetscFunctionBegin; 723 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 724 if (!a->nz) PetscFunctionReturn(0); 725 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 726 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 727 v = a->a; 728 xb = x; 729 730 for (i=0; i<mbs; i++) { 731 n = ai[1] - ai[0]; /* length of i_th row of A */ 732 x1 = xb[0]; 733 ib = aj + *ai; 734 jmin = 0; 735 nonzerorow += (n>0); 736 if (*ib == i) { /* (diag of A)*x */ 737 z[i] += *v++ * x[*ib++]; jmin++; 738 } 739 for (j=jmin; j<n; j++) { 740 cval = *ib; 741 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 742 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 743 } 744 xb++; ai++; 745 } 746 747 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 748 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 749 750 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 751 PetscFunctionReturn(0); 752 } 753 754 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 755 { 756 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 757 PetscScalar *z,x1,x2; 758 const PetscScalar *x,*xb; 759 const MatScalar *v; 760 PetscErrorCode ierr; 761 PetscInt mbs =a->mbs,i,n,cval,j,jmin; 762 const PetscInt *aj=a->j,*ai=a->i,*ib; 763 PetscInt nonzerorow=0; 764 765 PetscFunctionBegin; 766 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 767 if (!a->nz) PetscFunctionReturn(0); 768 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 769 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 770 771 v = a->a; 772 xb = x; 773 774 for (i=0; i<mbs; i++) { 775 n = ai[1] - ai[0]; /* length of i_th block row of A */ 776 x1 = xb[0]; x2 = xb[1]; 777 ib = aj + *ai; 778 jmin = 0; 779 nonzerorow += (n>0); 780 if (*ib == i) { /* (diag of A)*x */ 781 z[2*i] += v[0]*x1 + v[2]*x2; 782 z[2*i+1] += v[2]*x1 + v[3]*x2; 783 v += 4; jmin++; 784 } 785 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 786 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 787 for (j=jmin; j<n; j++) { 788 /* (strict lower triangular part of A)*x */ 789 cval = ib[j]*2; 790 z[cval] += v[0]*x1 + v[1]*x2; 791 z[cval+1] += v[2]*x1 + v[3]*x2; 792 /* (strict upper triangular part of A)*x */ 793 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 794 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 795 v += 4; 796 } 797 xb +=2; ai++; 798 } 799 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 800 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 801 802 ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 803 PetscFunctionReturn(0); 804 } 805 806 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 807 { 808 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 809 PetscScalar *z,x1,x2,x3; 810 const PetscScalar *x,*xb; 811 const MatScalar *v; 812 PetscErrorCode ierr; 813 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 814 const PetscInt *aj=a->j,*ai=a->i,*ib; 815 PetscInt nonzerorow=0; 816 817 PetscFunctionBegin; 818 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 819 if (!a->nz) PetscFunctionReturn(0); 820 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 821 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 822 823 v = a->a; 824 xb = x; 825 826 for (i=0; i<mbs; i++) { 827 n = ai[1] - ai[0]; /* length of i_th block row of A */ 828 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 829 ib = aj + *ai; 830 jmin = 0; 831 nonzerorow += (n>0); 832 if (*ib == i) { /* (diag of A)*x */ 833 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 834 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 835 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 836 v += 9; jmin++; 837 } 838 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 839 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 840 for (j=jmin; j<n; j++) { 841 /* (strict lower triangular part of A)*x */ 842 cval = ib[j]*3; 843 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 844 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 845 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 846 /* (strict upper triangular part of A)*x */ 847 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 848 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 849 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 850 v += 9; 851 } 852 xb +=3; ai++; 853 } 854 855 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 856 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 857 858 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 859 PetscFunctionReturn(0); 860 } 861 862 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 863 { 864 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 865 PetscScalar *z,x1,x2,x3,x4; 866 const PetscScalar *x,*xb; 867 const MatScalar *v; 868 PetscErrorCode ierr; 869 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 870 const PetscInt *aj=a->j,*ai=a->i,*ib; 871 PetscInt nonzerorow=0; 872 873 PetscFunctionBegin; 874 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 875 if (!a->nz) PetscFunctionReturn(0); 876 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 877 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 878 879 v = a->a; 880 xb = x; 881 882 for (i=0; i<mbs; i++) { 883 n = ai[1] - ai[0]; /* length of i_th block row of A */ 884 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 885 ib = aj + *ai; 886 jmin = 0; 887 nonzerorow += (n>0); 888 if (*ib == i) { /* (diag of A)*x */ 889 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 890 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 891 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 892 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 893 v += 16; jmin++; 894 } 895 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 896 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 897 for (j=jmin; j<n; j++) { 898 /* (strict lower triangular part of A)*x */ 899 cval = ib[j]*4; 900 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 901 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 902 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 903 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 904 /* (strict upper triangular part of A)*x */ 905 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 906 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 907 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 908 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 909 v += 16; 910 } 911 xb +=4; ai++; 912 } 913 914 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 915 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 916 917 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 918 PetscFunctionReturn(0); 919 } 920 921 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 922 { 923 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 924 PetscScalar *z,x1,x2,x3,x4,x5; 925 const PetscScalar *x,*xb; 926 const MatScalar *v; 927 PetscErrorCode ierr; 928 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 929 const PetscInt *aj=a->j,*ai=a->i,*ib; 930 PetscInt nonzerorow=0; 931 932 PetscFunctionBegin; 933 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 934 if (!a->nz) PetscFunctionReturn(0); 935 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 936 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 937 938 v = a->a; 939 xb = x; 940 941 for (i=0; i<mbs; i++) { 942 n = ai[1] - ai[0]; /* length of i_th block row of A */ 943 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 944 ib = aj + *ai; 945 jmin = 0; 946 nonzerorow += (n>0); 947 if (*ib == i) { /* (diag of A)*x */ 948 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 949 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 950 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 951 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 952 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 953 v += 25; jmin++; 954 } 955 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 956 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 957 for (j=jmin; j<n; j++) { 958 /* (strict lower triangular part of A)*x */ 959 cval = ib[j]*5; 960 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 961 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 962 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 963 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 964 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 965 /* (strict upper triangular part of A)*x */ 966 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 967 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 968 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 969 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 970 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 971 v += 25; 972 } 973 xb +=5; ai++; 974 } 975 976 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 977 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 978 979 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 980 PetscFunctionReturn(0); 981 } 982 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 983 { 984 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 985 PetscScalar *z,x1,x2,x3,x4,x5,x6; 986 const PetscScalar *x,*xb; 987 const MatScalar *v; 988 PetscErrorCode ierr; 989 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 990 const PetscInt *aj=a->j,*ai=a->i,*ib; 991 PetscInt nonzerorow=0; 992 993 PetscFunctionBegin; 994 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 995 if (!a->nz) PetscFunctionReturn(0); 996 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 997 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 998 999 v = a->a; 1000 xb = x; 1001 1002 for (i=0; i<mbs; i++) { 1003 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1004 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 1005 ib = aj + *ai; 1006 jmin = 0; 1007 nonzerorow += (n>0); 1008 if (*ib == i) { /* (diag of A)*x */ 1009 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 1010 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 1011 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 1012 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 1013 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 1014 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1015 v += 36; jmin++; 1016 } 1017 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1018 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1019 for (j=jmin; j<n; j++) { 1020 /* (strict lower triangular part of A)*x */ 1021 cval = ib[j]*6; 1022 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 1023 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 1024 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 1025 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 1026 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 1027 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1028 /* (strict upper triangular part of A)*x */ 1029 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 1030 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 1031 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 1032 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 1033 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 1034 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 1035 v += 36; 1036 } 1037 xb +=6; ai++; 1038 } 1039 1040 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1041 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1042 1043 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1044 PetscFunctionReturn(0); 1045 } 1046 1047 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1048 { 1049 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1050 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7; 1051 const PetscScalar *x,*xb; 1052 const MatScalar *v; 1053 PetscErrorCode ierr; 1054 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 1055 const PetscInt *aj=a->j,*ai=a->i,*ib; 1056 PetscInt nonzerorow=0; 1057 1058 PetscFunctionBegin; 1059 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1060 if (!a->nz) PetscFunctionReturn(0); 1061 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1062 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1063 1064 v = a->a; 1065 xb = x; 1066 1067 for (i=0; i<mbs; i++) { 1068 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1069 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1070 ib = aj + *ai; 1071 jmin = 0; 1072 nonzerorow += (n>0); 1073 if (*ib == i) { /* (diag of A)*x */ 1074 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1075 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1076 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1077 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1078 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1079 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1080 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1081 v += 49; jmin++; 1082 } 1083 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1084 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1085 for (j=jmin; j<n; j++) { 1086 /* (strict lower triangular part of A)*x */ 1087 cval = ib[j]*7; 1088 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1089 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1090 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1091 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1092 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1093 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1094 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1095 /* (strict upper triangular part of A)*x */ 1096 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1097 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1098 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1099 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1100 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1101 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1102 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1103 v += 49; 1104 } 1105 xb +=7; ai++; 1106 } 1107 1108 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1109 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1110 1111 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1112 PetscFunctionReturn(0); 1113 } 1114 1115 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1116 { 1117 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1118 PetscScalar *z,*z_ptr=0,*zb,*work,*workt; 1119 const PetscScalar *x,*x_ptr,*xb; 1120 const MatScalar *v; 1121 PetscErrorCode ierr; 1122 PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1123 const PetscInt *idx,*aj,*ii; 1124 PetscInt nonzerorow=0; 1125 1126 PetscFunctionBegin; 1127 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1128 if (!a->nz) PetscFunctionReturn(0); 1129 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x; 1130 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1131 1132 aj = a->j; 1133 v = a->a; 1134 ii = a->i; 1135 1136 if (!a->mult_work) { 1137 ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr); 1138 } 1139 work = a->mult_work; 1140 1141 1142 for (i=0; i<mbs; i++) { 1143 n = ii[1] - ii[0]; ncols = n*bs; 1144 workt = work; idx=aj+ii[0]; 1145 nonzerorow += (n>0); 1146 1147 /* upper triangular part */ 1148 for (j=0; j<n; j++) { 1149 xb = x_ptr + bs*(*idx++); 1150 for (k=0; k<bs; k++) workt[k] = xb[k]; 1151 workt += bs; 1152 } 1153 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1154 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1155 1156 /* strict lower triangular part */ 1157 idx = aj+ii[0]; 1158 if (*idx == i) { 1159 ncols -= bs; v += bs2; idx++; n--; 1160 } 1161 if (ncols > 0) { 1162 workt = work; 1163 ierr = PetscArrayzero(workt,ncols);CHKERRQ(ierr); 1164 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1165 for (j=0; j<n; j++) { 1166 zb = z_ptr + bs*(*idx++); 1167 for (k=0; k<bs; k++) zb[k] += workt[k]; 1168 workt += bs; 1169 } 1170 } 1171 1172 x += bs; v += n*bs2; z += bs; ii++; 1173 } 1174 1175 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1176 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1177 1178 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1179 PetscFunctionReturn(0); 1180 } 1181 1182 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1183 { 1184 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1185 PetscScalar oalpha = alpha; 1186 PetscErrorCode ierr; 1187 PetscBLASInt one = 1,totalnz; 1188 1189 PetscFunctionBegin; 1190 ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr); 1191 PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one)); 1192 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1193 PetscFunctionReturn(0); 1194 } 1195 1196 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1197 { 1198 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1199 const MatScalar *v = a->a; 1200 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1201 PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il; 1202 PetscErrorCode ierr; 1203 const PetscInt *aj=a->j,*col; 1204 1205 PetscFunctionBegin; 1206 if (!a->nz) { 1207 *norm = 0.0; 1208 PetscFunctionReturn(0); 1209 } 1210 if (type == NORM_FROBENIUS) { 1211 for (k=0; k<mbs; k++) { 1212 jmin = a->i[k]; jmax = a->i[k+1]; 1213 col = aj + jmin; 1214 if (*col == k) { /* diagonal block */ 1215 for (i=0; i<bs2; i++) { 1216 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1217 } 1218 jmin++; 1219 } 1220 for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */ 1221 for (i=0; i<bs2; i++) { 1222 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1223 } 1224 } 1225 } 1226 *norm = PetscSqrtReal(sum_diag + 2*sum_off); 1227 ierr = PetscLogFlops(2*bs2*a->nz);CHKERRQ(ierr); 1228 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1229 ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr); 1230 for (i=0; i<mbs; i++) jl[i] = mbs; 1231 il[0] = 0; 1232 1233 *norm = 0.0; 1234 for (k=0; k<mbs; k++) { /* k_th block row */ 1235 for (j=0; j<bs; j++) sum[j]=0.0; 1236 /*-- col sum --*/ 1237 i = jl[k]; /* first |A(i,k)| to be added */ 1238 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1239 at step k */ 1240 while (i<mbs) { 1241 nexti = jl[i]; /* next block row to be added */ 1242 ik = il[i]; /* block index of A(i,k) in the array a */ 1243 for (j=0; j<bs; j++) { 1244 v = a->a + ik*bs2 + j*bs; 1245 for (k1=0; k1<bs; k1++) { 1246 sum[j] += PetscAbsScalar(*v); v++; 1247 } 1248 } 1249 /* update il, jl */ 1250 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1251 jmax = a->i[i+1]; 1252 if (jmin < jmax) { 1253 il[i] = jmin; 1254 j = a->j[jmin]; 1255 jl[i] = jl[j]; jl[j]=i; 1256 } 1257 i = nexti; 1258 } 1259 /*-- row sum --*/ 1260 jmin = a->i[k]; jmax = a->i[k+1]; 1261 for (i=jmin; i<jmax; i++) { 1262 for (j=0; j<bs; j++) { 1263 v = a->a + i*bs2 + j; 1264 for (k1=0; k1<bs; k1++) { 1265 sum[j] += PetscAbsScalar(*v); v += bs; 1266 } 1267 } 1268 } 1269 /* add k_th block row to il, jl */ 1270 col = aj+jmin; 1271 if (*col == k) jmin++; 1272 if (jmin < jmax) { 1273 il[k] = jmin; 1274 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1275 } 1276 for (j=0; j<bs; j++) { 1277 if (sum[j] > *norm) *norm = sum[j]; 1278 } 1279 } 1280 ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 1281 ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr); 1282 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1283 PetscFunctionReturn(0); 1284 } 1285 1286 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg) 1287 { 1288 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data; 1289 PetscErrorCode ierr; 1290 1291 PetscFunctionBegin; 1292 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1293 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1294 *flg = PETSC_FALSE; 1295 PetscFunctionReturn(0); 1296 } 1297 1298 /* if the a->i are the same */ 1299 ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr); 1300 if (!*flg) PetscFunctionReturn(0); 1301 1302 /* if a->j are the same */ 1303 ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr); 1304 if (!*flg) PetscFunctionReturn(0); 1305 1306 /* if a->a are the same */ 1307 ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg);CHKERRQ(ierr); 1308 PetscFunctionReturn(0); 1309 } 1310 1311 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1312 { 1313 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1314 PetscErrorCode ierr; 1315 PetscInt i,j,k,row,bs,ambs,bs2; 1316 const PetscInt *ai,*aj; 1317 PetscScalar *x,zero = 0.0; 1318 const MatScalar *aa,*aa_j; 1319 1320 PetscFunctionBegin; 1321 bs = A->rmap->bs; 1322 if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1323 1324 aa = a->a; 1325 ambs = a->mbs; 1326 1327 if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 1328 PetscInt *diag=a->diag; 1329 aa = a->a; 1330 ambs = a->mbs; 1331 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1332 for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]]; 1333 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1334 PetscFunctionReturn(0); 1335 } 1336 1337 ai = a->i; 1338 aj = a->j; 1339 bs2 = a->bs2; 1340 ierr = VecSet(v,zero);CHKERRQ(ierr); 1341 if (!a->nz) PetscFunctionReturn(0); 1342 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1343 for (i=0; i<ambs; i++) { 1344 j=ai[i]; 1345 if (aj[j] == i) { /* if this is a diagonal element */ 1346 row = i*bs; 1347 aa_j = aa + j*bs2; 1348 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1349 } 1350 } 1351 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1352 PetscFunctionReturn(0); 1353 } 1354 1355 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1356 { 1357 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1358 PetscScalar x; 1359 const PetscScalar *l,*li,*ri; 1360 MatScalar *aa,*v; 1361 PetscErrorCode ierr; 1362 PetscInt i,j,k,lm,M,m,mbs,tmp,bs,bs2; 1363 const PetscInt *ai,*aj; 1364 PetscBool flg; 1365 1366 PetscFunctionBegin; 1367 if (ll != rr) { 1368 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1369 if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1370 } 1371 if (!ll) PetscFunctionReturn(0); 1372 ai = a->i; 1373 aj = a->j; 1374 aa = a->a; 1375 m = A->rmap->N; 1376 bs = A->rmap->bs; 1377 mbs = a->mbs; 1378 bs2 = a->bs2; 1379 1380 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 1381 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1382 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1383 for (i=0; i<mbs; i++) { /* for each block row */ 1384 M = ai[i+1] - ai[i]; 1385 li = l + i*bs; 1386 v = aa + bs2*ai[i]; 1387 for (j=0; j<M; j++) { /* for each block */ 1388 ri = l + bs*aj[ai[i]+j]; 1389 for (k=0; k<bs; k++) { 1390 x = ri[k]; 1391 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1392 } 1393 } 1394 } 1395 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 1396 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1397 PetscFunctionReturn(0); 1398 } 1399 1400 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1401 { 1402 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1403 1404 PetscFunctionBegin; 1405 info->block_size = a->bs2; 1406 info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */ 1407 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1408 info->nz_unneeded = info->nz_allocated - info->nz_used; 1409 info->assemblies = A->num_ass; 1410 info->mallocs = A->info.mallocs; 1411 info->memory = ((PetscObject)A)->mem; 1412 if (A->factortype) { 1413 info->fill_ratio_given = A->info.fill_ratio_given; 1414 info->fill_ratio_needed = A->info.fill_ratio_needed; 1415 info->factor_mallocs = A->info.factor_mallocs; 1416 } else { 1417 info->fill_ratio_given = 0; 1418 info->fill_ratio_needed = 0; 1419 info->factor_mallocs = 0; 1420 } 1421 PetscFunctionReturn(0); 1422 } 1423 1424 1425 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1426 { 1427 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1428 PetscErrorCode ierr; 1429 1430 PetscFunctionBegin; 1431 ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr); 1432 PetscFunctionReturn(0); 1433 } 1434 1435 /* 1436 This code does not work since it only checks the upper triangular part of 1437 the matrix. Hence it is not listed in the function table. 1438 */ 1439 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1440 { 1441 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1442 PetscErrorCode ierr; 1443 PetscInt i,j,n,row,col,bs,mbs; 1444 const PetscInt *ai,*aj; 1445 PetscReal atmp; 1446 const MatScalar *aa; 1447 PetscScalar *x; 1448 PetscInt ncols,brow,bcol,krow,kcol; 1449 1450 PetscFunctionBegin; 1451 if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1452 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1453 bs = A->rmap->bs; 1454 aa = a->a; 1455 ai = a->i; 1456 aj = a->j; 1457 mbs = a->mbs; 1458 1459 ierr = VecSet(v,0.0);CHKERRQ(ierr); 1460 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1461 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1462 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1463 for (i=0; i<mbs; i++) { 1464 ncols = ai[1] - ai[0]; ai++; 1465 brow = bs*i; 1466 for (j=0; j<ncols; j++) { 1467 bcol = bs*(*aj); 1468 for (kcol=0; kcol<bs; kcol++) { 1469 col = bcol + kcol; /* col index */ 1470 for (krow=0; krow<bs; krow++) { 1471 atmp = PetscAbsScalar(*aa); aa++; 1472 row = brow + krow; /* row index */ 1473 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1474 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1475 } 1476 } 1477 aj++; 1478 } 1479 } 1480 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1481 PetscFunctionReturn(0); 1482 } 1483