1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <../src/mat/impls/sbaij/seq/sbaij.h> 6 #include <petscblaslapack.h> 7 8 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 9 { 10 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 11 PetscErrorCode ierr; 12 PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 13 const PetscInt *idx; 14 PetscBT table_out,table_in; 15 16 PetscFunctionBegin; 17 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 18 mbs = a->mbs; 19 ai = a->i; 20 aj = a->j; 21 bs = A->rmap->bs; 22 ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr); 23 ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr); 24 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 25 ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr); 26 27 for (i=0; i<is_max; i++) { /* for each is */ 28 isz = 0; 29 ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr); 30 31 /* Extract the indices, assume there can be duplicate entries */ 32 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 33 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 34 35 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 36 bcol_max = 0; 37 for (j=0; j<n; ++j) { 38 brow = idx[j]/bs; /* convert the indices into block indices */ 39 if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 40 if (!PetscBTLookupSet(table_out,brow)) { 41 nidx[isz++] = brow; 42 if (bcol_max < brow) bcol_max = brow; 43 } 44 } 45 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 46 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 47 48 k = 0; 49 for (j=0; j<ov; j++) { /* for each overlap */ 50 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 51 ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr); 52 for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); } 53 54 n = isz; /* length of the updated is[i] */ 55 for (brow=0; brow<mbs; brow++) { 56 start = ai[brow]; end = ai[brow+1]; 57 if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 58 for (l = start; l<end; l++) { 59 bcol = aj[l]; 60 if (!PetscBTLookupSet(table_out,bcol)) { 61 nidx[isz++] = bcol; 62 if (bcol_max < bcol) bcol_max = bcol; 63 } 64 } 65 k++; 66 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 67 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 68 for (l = start; l<end; l++) { 69 bcol = aj[l]; 70 if (bcol > bcol_max) break; 71 if (PetscBTLookup(table_in,bcol)) { 72 if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow; 73 break; /* for l = start; l<end ; l++) */ 74 } 75 } 76 } 77 } 78 } /* for each overlap */ 79 80 /* expand the Index Set */ 81 for (j=0; j<isz; j++) { 82 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 83 } 84 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 85 } 86 ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr); 87 ierr = PetscFree(nidx);CHKERRQ(ierr); 88 ierr = PetscFree(nidx2);CHKERRQ(ierr); 89 ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr); 90 PetscFunctionReturn(0); 91 } 92 93 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 94 Zero some ops' to avoid invalid usse */ 95 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 96 { 97 PetscErrorCode ierr; 98 99 PetscFunctionBegin; 100 ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr); 101 Bseq->ops->mult = 0; 102 Bseq->ops->multadd = 0; 103 Bseq->ops->multtranspose = 0; 104 Bseq->ops->multtransposeadd = 0; 105 Bseq->ops->lufactor = 0; 106 Bseq->ops->choleskyfactor = 0; 107 Bseq->ops->lufactorsymbolic = 0; 108 Bseq->ops->choleskyfactorsymbolic = 0; 109 Bseq->ops->getinertia = 0; 110 PetscFunctionReturn(0); 111 } 112 113 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 114 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 115 { 116 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 117 PetscErrorCode ierr; 118 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 119 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 120 const PetscInt *irow,*icol; 121 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 122 PetscInt *aj = a->j,*ai = a->i; 123 MatScalar *mat_a; 124 Mat C; 125 PetscBool flag; 126 127 PetscFunctionBegin; 128 129 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 130 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 131 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 132 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 133 134 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 135 ssmap = smap; 136 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 137 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 138 /* determine lens of each row */ 139 for (i=0; i<nrows; i++) { 140 kstart = ai[irow[i]]; 141 kend = kstart + a->ilen[irow[i]]; 142 lens[i] = 0; 143 for (k=kstart; k<kend; k++) { 144 if (ssmap[aj[k]]) lens[i]++; 145 } 146 } 147 /* Create and fill new matrix */ 148 if (scall == MAT_REUSE_MATRIX) { 149 c = (Mat_SeqSBAIJ*)((*B)->data); 150 151 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 152 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 153 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 154 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 155 C = *B; 156 } else { 157 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 158 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 159 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 160 ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 161 } 162 c = (Mat_SeqSBAIJ*)(C->data); 163 for (i=0; i<nrows; i++) { 164 row = irow[i]; 165 kstart = ai[row]; 166 kend = kstart + a->ilen[row]; 167 mat_i = c->i[i]; 168 mat_j = c->j + mat_i; 169 mat_a = c->a + mat_i*bs2; 170 mat_ilen = c->ilen + i; 171 for (k=kstart; k<kend; k++) { 172 if ((tcol=ssmap[a->j[k]])) { 173 *mat_j++ = tcol - 1; 174 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 175 mat_a += bs2; 176 (*mat_ilen)++; 177 } 178 } 179 } 180 /* sort */ 181 { 182 MatScalar *work; 183 184 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 185 for (i=0; i<nrows; i++) { 186 PetscInt ilen; 187 mat_i = c->i[i]; 188 mat_j = c->j + mat_i; 189 mat_a = c->a + mat_i*bs2; 190 ilen = c->ilen[i]; 191 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 192 } 193 ierr = PetscFree(work);CHKERRQ(ierr); 194 } 195 196 /* Free work space */ 197 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 198 ierr = PetscFree(smap);CHKERRQ(ierr); 199 ierr = PetscFree(lens);CHKERRQ(ierr); 200 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 201 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 202 203 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 204 *B = C; 205 PetscFunctionReturn(0); 206 } 207 208 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 209 { 210 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 211 IS is1,is2; 212 PetscErrorCode ierr; 213 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs; 214 const PetscInt *irow,*icol; 215 216 PetscFunctionBegin; 217 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 218 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 219 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 220 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 221 222 /* Verify if the indices corespond to each element in a block 223 and form the IS with compressed IS */ 224 maxmnbs = PetscMax(a->mbs,a->nbs); 225 ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr); 226 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 227 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 228 for (i=0; i<a->mbs; i++) { 229 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 230 } 231 count = 0; 232 for (i=0; i<nrows; i++) { 233 PetscInt j = irow[i] / bs; 234 if ((vary[j]--)==bs) iary[count++] = j; 235 } 236 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 237 238 ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr); 239 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 240 for (i=0; i<a->nbs; i++) { 241 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 242 } 243 count = 0; 244 for (i=0; i<ncols; i++) { 245 PetscInt j = icol[i] / bs; 246 if ((vary[j]--)==bs) iary[count++] = j; 247 } 248 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 249 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 250 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 251 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 252 253 ierr = MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 254 ierr = ISDestroy(&is1);CHKERRQ(ierr); 255 ierr = ISDestroy(&is2);CHKERRQ(ierr); 256 257 if (isrow != iscol) { 258 PetscBool isequal; 259 ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr); 260 if (!isequal) { 261 ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr); 262 } 263 } 264 PetscFunctionReturn(0); 265 } 266 267 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 268 { 269 PetscErrorCode ierr; 270 PetscInt i; 271 272 PetscFunctionBegin; 273 if (scall == MAT_INITIAL_MATRIX) { 274 ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr); 275 } 276 277 for (i=0; i<n; i++) { 278 ierr = MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 279 } 280 PetscFunctionReturn(0); 281 } 282 283 /* -------------------------------------------------------*/ 284 /* Should check that shapes of vectors and matrices match */ 285 /* -------------------------------------------------------*/ 286 287 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 288 { 289 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 290 PetscScalar *z,x1,x2,zero=0.0; 291 const PetscScalar *x,*xb; 292 const MatScalar *v; 293 PetscErrorCode ierr; 294 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 295 const PetscInt *aj=a->j,*ai=a->i,*ib; 296 PetscInt nonzerorow=0; 297 298 PetscFunctionBegin; 299 ierr = VecSet(zz,zero);CHKERRQ(ierr); 300 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 301 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 302 303 v = a->a; 304 xb = x; 305 306 for (i=0; i<mbs; i++) { 307 n = ai[1] - ai[0]; /* length of i_th block row of A */ 308 x1 = xb[0]; x2 = xb[1]; 309 ib = aj + *ai; 310 jmin = 0; 311 nonzerorow += (n>0); 312 if (*ib == i) { /* (diag of A)*x */ 313 z[2*i] += v[0]*x1 + v[2]*x2; 314 z[2*i+1] += v[2]*x1 + v[3]*x2; 315 v += 4; jmin++; 316 } 317 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 318 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 319 for (j=jmin; j<n; j++) { 320 /* (strict lower triangular part of A)*x */ 321 cval = ib[j]*2; 322 z[cval] += v[0]*x1 + v[1]*x2; 323 z[cval+1] += v[2]*x1 + v[3]*x2; 324 /* (strict upper triangular part of A)*x */ 325 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 326 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 327 v += 4; 328 } 329 xb +=2; ai++; 330 } 331 332 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 333 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 334 ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 335 PetscFunctionReturn(0); 336 } 337 338 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 339 { 340 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 341 PetscScalar *z,x1,x2,x3,zero=0.0; 342 const PetscScalar *x,*xb; 343 const MatScalar *v; 344 PetscErrorCode ierr; 345 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 346 const PetscInt *aj = a->j,*ai = a->i,*ib; 347 PetscInt nonzerorow=0; 348 349 PetscFunctionBegin; 350 ierr = VecSet(zz,zero);CHKERRQ(ierr); 351 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 352 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 353 354 v = a->a; 355 xb = x; 356 357 for (i=0; i<mbs; i++) { 358 n = ai[1] - ai[0]; /* length of i_th block row of A */ 359 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 360 ib = aj + *ai; 361 jmin = 0; 362 nonzerorow += (n>0); 363 if (*ib == i) { /* (diag of A)*x */ 364 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 365 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 366 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 367 v += 9; jmin++; 368 } 369 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 370 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 371 for (j=jmin; j<n; j++) { 372 /* (strict lower triangular part of A)*x */ 373 cval = ib[j]*3; 374 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 375 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 376 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 377 /* (strict upper triangular part of A)*x */ 378 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 379 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 380 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 381 v += 9; 382 } 383 xb +=3; ai++; 384 } 385 386 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 387 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 388 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 389 PetscFunctionReturn(0); 390 } 391 392 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 393 { 394 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 395 PetscScalar *z,x1,x2,x3,x4,zero=0.0; 396 const PetscScalar *x,*xb; 397 const MatScalar *v; 398 PetscErrorCode ierr; 399 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 400 const PetscInt *aj = a->j,*ai = a->i,*ib; 401 PetscInt nonzerorow = 0; 402 403 PetscFunctionBegin; 404 ierr = VecSet(zz,zero);CHKERRQ(ierr); 405 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 406 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 407 408 v = a->a; 409 xb = x; 410 411 for (i=0; i<mbs; i++) { 412 n = ai[1] - ai[0]; /* length of i_th block row of A */ 413 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 414 ib = aj + *ai; 415 jmin = 0; 416 nonzerorow += (n>0); 417 if (*ib == i) { /* (diag of A)*x */ 418 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 419 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 420 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 421 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 422 v += 16; jmin++; 423 } 424 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 425 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 426 for (j=jmin; j<n; j++) { 427 /* (strict lower triangular part of A)*x */ 428 cval = ib[j]*4; 429 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 430 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 431 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 432 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 433 /* (strict upper triangular part of A)*x */ 434 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 435 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 436 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 437 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 438 v += 16; 439 } 440 xb +=4; ai++; 441 } 442 443 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 444 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 445 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 446 PetscFunctionReturn(0); 447 } 448 449 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 450 { 451 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 452 PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0; 453 const PetscScalar *x,*xb; 454 const MatScalar *v; 455 PetscErrorCode ierr; 456 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 457 const PetscInt *aj = a->j,*ai = a->i,*ib; 458 PetscInt nonzerorow=0; 459 460 PetscFunctionBegin; 461 ierr = VecSet(zz,zero);CHKERRQ(ierr); 462 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 463 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 464 465 v = a->a; 466 xb = x; 467 468 for (i=0; i<mbs; i++) { 469 n = ai[1] - ai[0]; /* length of i_th block row of A */ 470 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 471 ib = aj + *ai; 472 jmin = 0; 473 nonzerorow += (n>0); 474 if (*ib == i) { /* (diag of A)*x */ 475 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 476 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 477 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 478 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 479 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 480 v += 25; jmin++; 481 } 482 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 483 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 484 for (j=jmin; j<n; j++) { 485 /* (strict lower triangular part of A)*x */ 486 cval = ib[j]*5; 487 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 488 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 489 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 490 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 491 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 492 /* (strict upper triangular part of A)*x */ 493 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 494 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 495 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 496 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 497 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 498 v += 25; 499 } 500 xb +=5; ai++; 501 } 502 503 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 504 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 505 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 506 PetscFunctionReturn(0); 507 } 508 509 510 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 511 { 512 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 513 PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0; 514 const PetscScalar *x,*xb; 515 const MatScalar *v; 516 PetscErrorCode ierr; 517 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 518 const PetscInt *aj=a->j,*ai=a->i,*ib; 519 PetscInt nonzerorow=0; 520 521 PetscFunctionBegin; 522 ierr = VecSet(zz,zero);CHKERRQ(ierr); 523 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 524 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 525 526 v = a->a; 527 xb = x; 528 529 for (i=0; i<mbs; i++) { 530 n = ai[1] - ai[0]; /* length of i_th block row of A */ 531 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 532 ib = aj + *ai; 533 jmin = 0; 534 nonzerorow += (n>0); 535 if (*ib == i) { /* (diag of A)*x */ 536 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 537 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 538 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 539 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 540 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 541 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 542 v += 36; jmin++; 543 } 544 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 545 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 546 for (j=jmin; j<n; j++) { 547 /* (strict lower triangular part of A)*x */ 548 cval = ib[j]*6; 549 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 550 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 551 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 552 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 553 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 554 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 555 /* (strict upper triangular part of A)*x */ 556 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 557 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 558 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 559 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 560 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 561 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 562 v += 36; 563 } 564 xb +=6; ai++; 565 } 566 567 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 568 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 569 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 570 PetscFunctionReturn(0); 571 } 572 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 573 { 574 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 575 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 576 const PetscScalar *x,*xb; 577 const MatScalar *v; 578 PetscErrorCode ierr; 579 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 580 const PetscInt *aj=a->j,*ai=a->i,*ib; 581 PetscInt nonzerorow=0; 582 583 PetscFunctionBegin; 584 ierr = VecSet(zz,zero);CHKERRQ(ierr); 585 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 586 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 587 588 v = a->a; 589 xb = x; 590 591 for (i=0; i<mbs; i++) { 592 n = ai[1] - ai[0]; /* length of i_th block row of A */ 593 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 594 ib = aj + *ai; 595 jmin = 0; 596 nonzerorow += (n>0); 597 if (*ib == i) { /* (diag of A)*x */ 598 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 599 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 600 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 601 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 602 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 603 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 604 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 605 v += 49; jmin++; 606 } 607 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 608 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 609 for (j=jmin; j<n; j++) { 610 /* (strict lower triangular part of A)*x */ 611 cval = ib[j]*7; 612 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 613 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 614 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 615 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 616 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 617 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 618 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 619 /* (strict upper triangular part of A)*x */ 620 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 621 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 622 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 623 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 624 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 625 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 626 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 627 v += 49; 628 } 629 xb +=7; ai++; 630 } 631 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 632 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 633 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 634 PetscFunctionReturn(0); 635 } 636 637 /* 638 This will not work with MatScalar == float because it calls the BLAS 639 */ 640 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 641 { 642 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 643 PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0; 644 const PetscScalar *x,*x_ptr,*xb; 645 const MatScalar *v; 646 PetscErrorCode ierr; 647 PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 648 const PetscInt *idx,*aj,*ii; 649 PetscInt nonzerorow=0; 650 651 PetscFunctionBegin; 652 ierr = VecSet(zz,zero);CHKERRQ(ierr); 653 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x; 654 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 655 656 aj = a->j; 657 v = a->a; 658 ii = a->i; 659 660 if (!a->mult_work) { 661 ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr); 662 } 663 work = a->mult_work; 664 665 for (i=0; i<mbs; i++) { 666 n = ii[1] - ii[0]; ncols = n*bs; 667 workt = work; idx=aj+ii[0]; 668 nonzerorow += (n>0); 669 670 /* upper triangular part */ 671 for (j=0; j<n; j++) { 672 xb = x_ptr + bs*(*idx++); 673 for (k=0; k<bs; k++) workt[k] = xb[k]; 674 workt += bs; 675 } 676 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 677 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 678 679 /* strict lower triangular part */ 680 idx = aj+ii[0]; 681 if (*idx == i) { 682 ncols -= bs; v += bs2; idx++; n--; 683 } 684 685 if (ncols > 0) { 686 workt = work; 687 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 688 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 689 for (j=0; j<n; j++) { 690 zb = z_ptr + bs*(*idx++); 691 for (k=0; k<bs; k++) zb[k] += workt[k]; 692 workt += bs; 693 } 694 } 695 x += bs; v += n*bs2; z += bs; ii++; 696 } 697 698 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 699 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 700 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 701 PetscFunctionReturn(0); 702 } 703 704 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 705 { 706 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 707 PetscScalar *z,x1; 708 const PetscScalar *x,*xb; 709 const MatScalar *v; 710 PetscErrorCode ierr; 711 PetscInt mbs =a->mbs,i,n,cval,j,jmin; 712 const PetscInt *aj=a->j,*ai=a->i,*ib; 713 PetscInt nonzerorow=0; 714 715 PetscFunctionBegin; 716 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 717 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 718 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 719 v = a->a; 720 xb = x; 721 722 for (i=0; i<mbs; i++) { 723 n = ai[1] - ai[0]; /* length of i_th row of A */ 724 x1 = xb[0]; 725 ib = aj + *ai; 726 jmin = 0; 727 nonzerorow += (n>0); 728 if (*ib == i) { /* (diag of A)*x */ 729 z[i] += *v++ * x[*ib++]; jmin++; 730 } 731 for (j=jmin; j<n; j++) { 732 cval = *ib; 733 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 734 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 735 } 736 xb++; ai++; 737 } 738 739 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 740 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 741 742 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 743 PetscFunctionReturn(0); 744 } 745 746 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 747 { 748 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 749 PetscScalar *z,x1,x2; 750 const PetscScalar *x,*xb; 751 const MatScalar *v; 752 PetscErrorCode ierr; 753 PetscInt mbs =a->mbs,i,n,cval,j,jmin; 754 const PetscInt *aj=a->j,*ai=a->i,*ib; 755 PetscInt nonzerorow=0; 756 757 PetscFunctionBegin; 758 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 759 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 760 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 761 762 v = a->a; 763 xb = x; 764 765 for (i=0; i<mbs; i++) { 766 n = ai[1] - ai[0]; /* length of i_th block row of A */ 767 x1 = xb[0]; x2 = xb[1]; 768 ib = aj + *ai; 769 jmin = 0; 770 nonzerorow += (n>0); 771 if (*ib == i) { /* (diag of A)*x */ 772 z[2*i] += v[0]*x1 + v[2]*x2; 773 z[2*i+1] += v[2]*x1 + v[3]*x2; 774 v += 4; jmin++; 775 } 776 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 777 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 778 for (j=jmin; j<n; j++) { 779 /* (strict lower triangular part of A)*x */ 780 cval = ib[j]*2; 781 z[cval] += v[0]*x1 + v[1]*x2; 782 z[cval+1] += v[2]*x1 + v[3]*x2; 783 /* (strict upper triangular part of A)*x */ 784 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 785 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 786 v += 4; 787 } 788 xb +=2; ai++; 789 } 790 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 791 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 792 793 ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 794 PetscFunctionReturn(0); 795 } 796 797 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 798 { 799 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 800 PetscScalar *z,x1,x2,x3; 801 const PetscScalar *x,*xb; 802 const MatScalar *v; 803 PetscErrorCode ierr; 804 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 805 const PetscInt *aj=a->j,*ai=a->i,*ib; 806 PetscInt nonzerorow=0; 807 808 PetscFunctionBegin; 809 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 810 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 811 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 812 813 v = a->a; 814 xb = x; 815 816 for (i=0; i<mbs; i++) { 817 n = ai[1] - ai[0]; /* length of i_th block row of A */ 818 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 819 ib = aj + *ai; 820 jmin = 0; 821 nonzerorow += (n>0); 822 if (*ib == i) { /* (diag of A)*x */ 823 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 824 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 825 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 826 v += 9; jmin++; 827 } 828 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 829 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 830 for (j=jmin; j<n; j++) { 831 /* (strict lower triangular part of A)*x */ 832 cval = ib[j]*3; 833 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 834 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 835 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 836 /* (strict upper triangular part of A)*x */ 837 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 838 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 839 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 840 v += 9; 841 } 842 xb +=3; ai++; 843 } 844 845 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 846 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 847 848 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 849 PetscFunctionReturn(0); 850 } 851 852 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 853 { 854 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 855 PetscScalar *z,x1,x2,x3,x4; 856 const PetscScalar *x,*xb; 857 const MatScalar *v; 858 PetscErrorCode ierr; 859 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 860 const PetscInt *aj=a->j,*ai=a->i,*ib; 861 PetscInt nonzerorow=0; 862 863 PetscFunctionBegin; 864 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 865 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 866 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 867 868 v = a->a; 869 xb = x; 870 871 for (i=0; i<mbs; i++) { 872 n = ai[1] - ai[0]; /* length of i_th block row of A */ 873 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 874 ib = aj + *ai; 875 jmin = 0; 876 nonzerorow += (n>0); 877 if (*ib == i) { /* (diag of A)*x */ 878 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 879 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 880 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 881 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 882 v += 16; jmin++; 883 } 884 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 885 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 886 for (j=jmin; j<n; j++) { 887 /* (strict lower triangular part of A)*x */ 888 cval = ib[j]*4; 889 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 890 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 891 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 892 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 893 /* (strict upper triangular part of A)*x */ 894 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 895 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 896 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 897 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 898 v += 16; 899 } 900 xb +=4; ai++; 901 } 902 903 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 904 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 905 906 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 907 PetscFunctionReturn(0); 908 } 909 910 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 911 { 912 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 913 PetscScalar *z,x1,x2,x3,x4,x5; 914 const PetscScalar *x,*xb; 915 const MatScalar *v; 916 PetscErrorCode ierr; 917 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 918 const PetscInt *aj=a->j,*ai=a->i,*ib; 919 PetscInt nonzerorow=0; 920 921 PetscFunctionBegin; 922 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 923 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 924 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 925 926 v = a->a; 927 xb = x; 928 929 for (i=0; i<mbs; i++) { 930 n = ai[1] - ai[0]; /* length of i_th block row of A */ 931 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 932 ib = aj + *ai; 933 jmin = 0; 934 nonzerorow += (n>0); 935 if (*ib == i) { /* (diag of A)*x */ 936 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 937 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 938 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 939 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 940 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 941 v += 25; jmin++; 942 } 943 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 944 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 945 for (j=jmin; j<n; j++) { 946 /* (strict lower triangular part of A)*x */ 947 cval = ib[j]*5; 948 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 949 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 950 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 951 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 952 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 953 /* (strict upper triangular part of A)*x */ 954 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 955 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 956 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 957 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 958 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 959 v += 25; 960 } 961 xb +=5; ai++; 962 } 963 964 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 965 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 966 967 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 968 PetscFunctionReturn(0); 969 } 970 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 971 { 972 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 973 PetscScalar *z,x1,x2,x3,x4,x5,x6; 974 const PetscScalar *x,*xb; 975 const MatScalar *v; 976 PetscErrorCode ierr; 977 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 978 const PetscInt *aj=a->j,*ai=a->i,*ib; 979 PetscInt nonzerorow=0; 980 981 PetscFunctionBegin; 982 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 983 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 984 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 985 986 v = a->a; 987 xb = x; 988 989 for (i=0; i<mbs; i++) { 990 n = ai[1] - ai[0]; /* length of i_th block row of A */ 991 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 992 ib = aj + *ai; 993 jmin = 0; 994 nonzerorow += (n>0); 995 if (*ib == i) { /* (diag of A)*x */ 996 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 997 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 998 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 999 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 1000 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 1001 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1002 v += 36; jmin++; 1003 } 1004 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1005 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1006 for (j=jmin; j<n; j++) { 1007 /* (strict lower triangular part of A)*x */ 1008 cval = ib[j]*6; 1009 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 1010 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 1011 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 1012 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 1013 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 1014 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1015 /* (strict upper triangular part of A)*x */ 1016 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 1017 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 1018 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 1019 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 1020 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 1021 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 1022 v += 36; 1023 } 1024 xb +=6; ai++; 1025 } 1026 1027 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1028 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1029 1030 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1031 PetscFunctionReturn(0); 1032 } 1033 1034 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1035 { 1036 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1037 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7; 1038 const PetscScalar *x,*xb; 1039 const MatScalar *v; 1040 PetscErrorCode ierr; 1041 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 1042 const PetscInt *aj=a->j,*ai=a->i,*ib; 1043 PetscInt nonzerorow=0; 1044 1045 PetscFunctionBegin; 1046 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1047 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1048 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1049 1050 v = a->a; 1051 xb = x; 1052 1053 for (i=0; i<mbs; i++) { 1054 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1055 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1056 ib = aj + *ai; 1057 jmin = 0; 1058 nonzerorow += (n>0); 1059 if (*ib == i) { /* (diag of A)*x */ 1060 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1061 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1062 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1063 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1064 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1065 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1066 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1067 v += 49; jmin++; 1068 } 1069 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1070 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1071 for (j=jmin; j<n; j++) { 1072 /* (strict lower triangular part of A)*x */ 1073 cval = ib[j]*7; 1074 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1075 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1076 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1077 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1078 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1079 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1080 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1081 /* (strict upper triangular part of A)*x */ 1082 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1083 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1084 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1085 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1086 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1087 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1088 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1089 v += 49; 1090 } 1091 xb +=7; ai++; 1092 } 1093 1094 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1095 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1096 1097 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1098 PetscFunctionReturn(0); 1099 } 1100 1101 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1102 { 1103 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1104 PetscScalar *z,*z_ptr=0,*zb,*work,*workt; 1105 const PetscScalar *x,*x_ptr,*xb; 1106 const MatScalar *v; 1107 PetscErrorCode ierr; 1108 PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1109 const PetscInt *idx,*aj,*ii; 1110 PetscInt nonzerorow=0; 1111 1112 PetscFunctionBegin; 1113 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1114 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x; 1115 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1116 1117 aj = a->j; 1118 v = a->a; 1119 ii = a->i; 1120 1121 if (!a->mult_work) { 1122 ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr); 1123 } 1124 work = a->mult_work; 1125 1126 1127 for (i=0; i<mbs; i++) { 1128 n = ii[1] - ii[0]; ncols = n*bs; 1129 workt = work; idx=aj+ii[0]; 1130 nonzerorow += (n>0); 1131 1132 /* upper triangular part */ 1133 for (j=0; j<n; j++) { 1134 xb = x_ptr + bs*(*idx++); 1135 for (k=0; k<bs; k++) workt[k] = xb[k]; 1136 workt += bs; 1137 } 1138 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1139 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1140 1141 /* strict lower triangular part */ 1142 idx = aj+ii[0]; 1143 if (*idx == i) { 1144 ncols -= bs; v += bs2; idx++; n--; 1145 } 1146 if (ncols > 0) { 1147 workt = work; 1148 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1149 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1150 for (j=0; j<n; j++) { 1151 zb = z_ptr + bs*(*idx++); 1152 for (k=0; k<bs; k++) zb[k] += workt[k]; 1153 workt += bs; 1154 } 1155 } 1156 1157 x += bs; v += n*bs2; z += bs; ii++; 1158 } 1159 1160 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1161 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1162 1163 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1164 PetscFunctionReturn(0); 1165 } 1166 1167 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1168 { 1169 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1170 PetscScalar oalpha = alpha; 1171 PetscErrorCode ierr; 1172 PetscBLASInt one = 1,totalnz; 1173 1174 PetscFunctionBegin; 1175 ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr); 1176 PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one)); 1177 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1178 PetscFunctionReturn(0); 1179 } 1180 1181 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1182 { 1183 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1184 const MatScalar *v = a->a; 1185 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1186 PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il; 1187 PetscErrorCode ierr; 1188 const PetscInt *aj=a->j,*col; 1189 1190 PetscFunctionBegin; 1191 if (type == NORM_FROBENIUS) { 1192 for (k=0; k<mbs; k++) { 1193 jmin = a->i[k]; jmax = a->i[k+1]; 1194 col = aj + jmin; 1195 if (*col == k) { /* diagonal block */ 1196 for (i=0; i<bs2; i++) { 1197 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1198 } 1199 jmin++; 1200 } 1201 for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */ 1202 for (i=0; i<bs2; i++) { 1203 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1204 } 1205 } 1206 } 1207 *norm = PetscSqrtReal(sum_diag + 2*sum_off); 1208 ierr = PetscLogFlops(2*bs2*a->nz);CHKERRQ(ierr); 1209 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1210 ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr); 1211 for (i=0; i<mbs; i++) jl[i] = mbs; 1212 il[0] = 0; 1213 1214 *norm = 0.0; 1215 for (k=0; k<mbs; k++) { /* k_th block row */ 1216 for (j=0; j<bs; j++) sum[j]=0.0; 1217 /*-- col sum --*/ 1218 i = jl[k]; /* first |A(i,k)| to be added */ 1219 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1220 at step k */ 1221 while (i<mbs) { 1222 nexti = jl[i]; /* next block row to be added */ 1223 ik = il[i]; /* block index of A(i,k) in the array a */ 1224 for (j=0; j<bs; j++) { 1225 v = a->a + ik*bs2 + j*bs; 1226 for (k1=0; k1<bs; k1++) { 1227 sum[j] += PetscAbsScalar(*v); v++; 1228 } 1229 } 1230 /* update il, jl */ 1231 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1232 jmax = a->i[i+1]; 1233 if (jmin < jmax) { 1234 il[i] = jmin; 1235 j = a->j[jmin]; 1236 jl[i] = jl[j]; jl[j]=i; 1237 } 1238 i = nexti; 1239 } 1240 /*-- row sum --*/ 1241 jmin = a->i[k]; jmax = a->i[k+1]; 1242 for (i=jmin; i<jmax; i++) { 1243 for (j=0; j<bs; j++) { 1244 v = a->a + i*bs2 + j; 1245 for (k1=0; k1<bs; k1++) { 1246 sum[j] += PetscAbsScalar(*v); v += bs; 1247 } 1248 } 1249 } 1250 /* add k_th block row to il, jl */ 1251 col = aj+jmin; 1252 if (*col == k) jmin++; 1253 if (jmin < jmax) { 1254 il[k] = jmin; 1255 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1256 } 1257 for (j=0; j<bs; j++) { 1258 if (sum[j] > *norm) *norm = sum[j]; 1259 } 1260 } 1261 ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 1262 ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr); 1263 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1264 PetscFunctionReturn(0); 1265 } 1266 1267 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg) 1268 { 1269 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data; 1270 PetscErrorCode ierr; 1271 1272 PetscFunctionBegin; 1273 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1274 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1275 *flg = PETSC_FALSE; 1276 PetscFunctionReturn(0); 1277 } 1278 1279 /* if the a->i are the same */ 1280 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1281 if (!*flg) PetscFunctionReturn(0); 1282 1283 /* if a->j are the same */ 1284 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1285 if (!*flg) PetscFunctionReturn(0); 1286 1287 /* if a->a are the same */ 1288 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1289 PetscFunctionReturn(0); 1290 } 1291 1292 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1293 { 1294 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1295 PetscErrorCode ierr; 1296 PetscInt i,j,k,row,bs,ambs,bs2; 1297 const PetscInt *ai,*aj; 1298 PetscScalar *x,zero = 0.0; 1299 const MatScalar *aa,*aa_j; 1300 1301 PetscFunctionBegin; 1302 bs = A->rmap->bs; 1303 if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1304 1305 aa = a->a; 1306 ambs = a->mbs; 1307 1308 if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 1309 PetscInt *diag=a->diag; 1310 aa = a->a; 1311 ambs = a->mbs; 1312 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1313 for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]]; 1314 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1315 PetscFunctionReturn(0); 1316 } 1317 1318 ai = a->i; 1319 aj = a->j; 1320 bs2 = a->bs2; 1321 ierr = VecSet(v,zero);CHKERRQ(ierr); 1322 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1323 for (i=0; i<ambs; i++) { 1324 j=ai[i]; 1325 if (aj[j] == i) { /* if this is a diagonal element */ 1326 row = i*bs; 1327 aa_j = aa + j*bs2; 1328 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1329 } 1330 } 1331 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1332 PetscFunctionReturn(0); 1333 } 1334 1335 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1336 { 1337 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1338 PetscScalar x; 1339 const PetscScalar *l,*li,*ri; 1340 MatScalar *aa,*v; 1341 PetscErrorCode ierr; 1342 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1343 PetscBool flg; 1344 1345 PetscFunctionBegin; 1346 if (ll != rr) { 1347 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1348 if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1349 } 1350 if (!ll) PetscFunctionReturn(0); 1351 ai = a->i; 1352 aj = a->j; 1353 aa = a->a; 1354 m = A->rmap->N; 1355 bs = A->rmap->bs; 1356 mbs = a->mbs; 1357 bs2 = a->bs2; 1358 1359 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 1360 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1361 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1362 for (i=0; i<mbs; i++) { /* for each block row */ 1363 M = ai[i+1] - ai[i]; 1364 li = l + i*bs; 1365 v = aa + bs2*ai[i]; 1366 for (j=0; j<M; j++) { /* for each block */ 1367 ri = l + bs*aj[ai[i]+j]; 1368 for (k=0; k<bs; k++) { 1369 x = ri[k]; 1370 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1371 } 1372 } 1373 } 1374 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 1375 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1376 PetscFunctionReturn(0); 1377 } 1378 1379 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1380 { 1381 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1382 1383 PetscFunctionBegin; 1384 info->block_size = a->bs2; 1385 info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */ 1386 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1387 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1388 info->assemblies = A->num_ass; 1389 info->mallocs = A->info.mallocs; 1390 info->memory = ((PetscObject)A)->mem; 1391 if (A->factortype) { 1392 info->fill_ratio_given = A->info.fill_ratio_given; 1393 info->fill_ratio_needed = A->info.fill_ratio_needed; 1394 info->factor_mallocs = A->info.factor_mallocs; 1395 } else { 1396 info->fill_ratio_given = 0; 1397 info->fill_ratio_needed = 0; 1398 info->factor_mallocs = 0; 1399 } 1400 PetscFunctionReturn(0); 1401 } 1402 1403 1404 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1405 { 1406 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1407 PetscErrorCode ierr; 1408 1409 PetscFunctionBegin; 1410 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1411 PetscFunctionReturn(0); 1412 } 1413 1414 /* 1415 This code does not work since it only checks the upper triangular part of 1416 the matrix. Hence it is not listed in the function table. 1417 */ 1418 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1419 { 1420 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1421 PetscErrorCode ierr; 1422 PetscInt i,j,n,row,col,bs,mbs; 1423 const PetscInt *ai,*aj; 1424 PetscReal atmp; 1425 const MatScalar *aa; 1426 PetscScalar *x; 1427 PetscInt ncols,brow,bcol,krow,kcol; 1428 1429 PetscFunctionBegin; 1430 if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1431 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1432 bs = A->rmap->bs; 1433 aa = a->a; 1434 ai = a->i; 1435 aj = a->j; 1436 mbs = a->mbs; 1437 1438 ierr = VecSet(v,0.0);CHKERRQ(ierr); 1439 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1440 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1441 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1442 for (i=0; i<mbs; i++) { 1443 ncols = ai[1] - ai[0]; ai++; 1444 brow = bs*i; 1445 for (j=0; j<ncols; j++) { 1446 bcol = bs*(*aj); 1447 for (kcol=0; kcol<bs; kcol++) { 1448 col = bcol + kcol; /* col index */ 1449 for (krow=0; krow<bs; krow++) { 1450 atmp = PetscAbsScalar(*aa); aa++; 1451 row = brow + krow; /* row index */ 1452 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1453 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1454 } 1455 } 1456 aj++; 1457 } 1458 } 1459 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1460 PetscFunctionReturn(0); 1461 } 1462