1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <../src/mat/impls/dense/seq/dense.h> 4 #include <../src/mat/impls/sbaij/seq/sbaij.h> 5 #include <petsc/private/kernels/blockinvert.h> 6 #include <petscbt.h> 7 #include <petscblaslapack.h> 8 9 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 10 { 11 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 12 PetscErrorCode ierr; 13 PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 14 const PetscInt *idx; 15 PetscBT table_out,table_in; 16 17 PetscFunctionBegin; 18 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 19 mbs = a->mbs; 20 ai = a->i; 21 aj = a->j; 22 bs = A->rmap->bs; 23 ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr); 24 ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr); 25 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 26 ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr); 27 28 for (i=0; i<is_max; i++) { /* for each is */ 29 isz = 0; 30 ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr); 31 32 /* Extract the indices, assume there can be duplicate entries */ 33 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 34 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 35 36 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 37 bcol_max = 0; 38 for (j=0; j<n; ++j) { 39 brow = idx[j]/bs; /* convert the indices into block indices */ 40 if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 41 if (!PetscBTLookupSet(table_out,brow)) { 42 nidx[isz++] = brow; 43 if (bcol_max < brow) bcol_max = brow; 44 } 45 } 46 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 47 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 48 49 k = 0; 50 for (j=0; j<ov; j++) { /* for each overlap */ 51 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 52 ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr); 53 for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); } 54 55 n = isz; /* length of the updated is[i] */ 56 for (brow=0; brow<mbs; brow++) { 57 start = ai[brow]; end = ai[brow+1]; 58 if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 59 for (l = start; l<end; l++) { 60 bcol = aj[l]; 61 if (!PetscBTLookupSet(table_out,bcol)) { 62 nidx[isz++] = bcol; 63 if (bcol_max < bcol) bcol_max = bcol; 64 } 65 } 66 k++; 67 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 68 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 69 for (l = start; l<end; l++) { 70 bcol = aj[l]; 71 if (bcol > bcol_max) break; 72 if (PetscBTLookup(table_in,bcol)) { 73 if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow; 74 break; /* for l = start; l<end ; l++) */ 75 } 76 } 77 } 78 } 79 } /* for each overlap */ 80 81 /* expand the Index Set */ 82 for (j=0; j<isz; j++) { 83 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 84 } 85 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 86 } 87 ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr); 88 ierr = PetscFree(nidx);CHKERRQ(ierr); 89 ierr = PetscFree(nidx2);CHKERRQ(ierr); 90 ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr); 91 PetscFunctionReturn(0); 92 } 93 94 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 95 Zero some ops' to avoid invalid usse */ 96 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 97 { 98 PetscErrorCode ierr; 99 100 PetscFunctionBegin; 101 ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr); 102 Bseq->ops->mult = 0; 103 Bseq->ops->multadd = 0; 104 Bseq->ops->multtranspose = 0; 105 Bseq->ops->multtransposeadd = 0; 106 Bseq->ops->lufactor = 0; 107 Bseq->ops->choleskyfactor = 0; 108 Bseq->ops->lufactorsymbolic = 0; 109 Bseq->ops->choleskyfactorsymbolic = 0; 110 Bseq->ops->getinertia = 0; 111 PetscFunctionReturn(0); 112 } 113 114 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 115 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 116 { 117 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 118 PetscErrorCode ierr; 119 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 120 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 121 const PetscInt *irow,*icol; 122 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 123 PetscInt *aj = a->j,*ai = a->i; 124 MatScalar *mat_a; 125 Mat C; 126 PetscBool flag; 127 128 PetscFunctionBegin; 129 130 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 131 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 132 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 133 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 134 135 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 136 ssmap = smap; 137 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 138 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 139 /* determine lens of each row */ 140 for (i=0; i<nrows; i++) { 141 kstart = ai[irow[i]]; 142 kend = kstart + a->ilen[irow[i]]; 143 lens[i] = 0; 144 for (k=kstart; k<kend; k++) { 145 if (ssmap[aj[k]]) lens[i]++; 146 } 147 } 148 /* Create and fill new matrix */ 149 if (scall == MAT_REUSE_MATRIX) { 150 c = (Mat_SeqSBAIJ*)((*B)->data); 151 152 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 153 ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr); 154 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 155 ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr); 156 C = *B; 157 } else { 158 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 159 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 160 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 161 ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 162 } 163 c = (Mat_SeqSBAIJ*)(C->data); 164 for (i=0; i<nrows; i++) { 165 row = irow[i]; 166 kstart = ai[row]; 167 kend = kstart + a->ilen[row]; 168 mat_i = c->i[i]; 169 mat_j = c->j + mat_i; 170 mat_a = c->a + mat_i*bs2; 171 mat_ilen = c->ilen + i; 172 for (k=kstart; k<kend; k++) { 173 if ((tcol=ssmap[a->j[k]])) { 174 *mat_j++ = tcol - 1; 175 ierr = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr); 176 mat_a += bs2; 177 (*mat_ilen)++; 178 } 179 } 180 } 181 /* sort */ 182 { 183 MatScalar *work; 184 185 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 186 for (i=0; i<nrows; i++) { 187 PetscInt ilen; 188 mat_i = c->i[i]; 189 mat_j = c->j + mat_i; 190 mat_a = c->a + mat_i*bs2; 191 ilen = c->ilen[i]; 192 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 193 } 194 ierr = PetscFree(work);CHKERRQ(ierr); 195 } 196 197 /* Free work space */ 198 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 199 ierr = PetscFree(smap);CHKERRQ(ierr); 200 ierr = PetscFree(lens);CHKERRQ(ierr); 201 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 202 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 203 204 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 205 *B = C; 206 PetscFunctionReturn(0); 207 } 208 209 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 210 { 211 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 212 IS is1,is2; 213 PetscErrorCode ierr; 214 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs; 215 const PetscInt *irow,*icol; 216 217 PetscFunctionBegin; 218 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 219 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 220 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 221 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 222 223 /* Verify if the indices corespond to each element in a block 224 and form the IS with compressed IS */ 225 maxmnbs = PetscMax(a->mbs,a->nbs); 226 ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr); 227 ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr); 228 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 229 for (i=0; i<a->mbs; i++) { 230 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 231 } 232 count = 0; 233 for (i=0; i<nrows; i++) { 234 PetscInt j = irow[i] / bs; 235 if ((vary[j]--)==bs) iary[count++] = j; 236 } 237 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 238 239 ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr); 240 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 241 for (i=0; i<a->nbs; i++) { 242 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 243 } 244 count = 0; 245 for (i=0; i<ncols; i++) { 246 PetscInt j = icol[i] / bs; 247 if ((vary[j]--)==bs) iary[count++] = j; 248 } 249 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 250 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 251 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 252 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 253 254 ierr = MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 255 ierr = ISDestroy(&is1);CHKERRQ(ierr); 256 ierr = ISDestroy(&is2);CHKERRQ(ierr); 257 258 if (isrow != iscol) { 259 PetscBool isequal; 260 ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr); 261 if (!isequal) { 262 ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr); 263 } 264 } 265 PetscFunctionReturn(0); 266 } 267 268 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 269 { 270 PetscErrorCode ierr; 271 PetscInt i; 272 273 PetscFunctionBegin; 274 if (scall == MAT_INITIAL_MATRIX) { 275 ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr); 276 } 277 278 for (i=0; i<n; i++) { 279 ierr = MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 280 } 281 PetscFunctionReturn(0); 282 } 283 284 /* -------------------------------------------------------*/ 285 /* Should check that shapes of vectors and matrices match */ 286 /* -------------------------------------------------------*/ 287 288 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 289 { 290 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 291 PetscScalar *z,x1,x2,zero=0.0; 292 const PetscScalar *x,*xb; 293 const MatScalar *v; 294 PetscErrorCode ierr; 295 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 296 const PetscInt *aj=a->j,*ai=a->i,*ib; 297 PetscInt nonzerorow=0; 298 299 PetscFunctionBegin; 300 ierr = VecSet(zz,zero);CHKERRQ(ierr); 301 if (!a->nz) PetscFunctionReturn(0); 302 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 303 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 304 305 v = a->a; 306 xb = x; 307 308 for (i=0; i<mbs; i++) { 309 n = ai[1] - ai[0]; /* length of i_th block row of A */ 310 x1 = xb[0]; x2 = xb[1]; 311 ib = aj + *ai; 312 jmin = 0; 313 nonzerorow += (n>0); 314 if (*ib == i) { /* (diag of A)*x */ 315 z[2*i] += v[0]*x1 + v[2]*x2; 316 z[2*i+1] += v[2]*x1 + v[3]*x2; 317 v += 4; jmin++; 318 } 319 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 320 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 321 for (j=jmin; j<n; j++) { 322 /* (strict lower triangular part of A)*x */ 323 cval = ib[j]*2; 324 z[cval] += v[0]*x1 + v[1]*x2; 325 z[cval+1] += v[2]*x1 + v[3]*x2; 326 /* (strict upper triangular part of A)*x */ 327 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 328 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 329 v += 4; 330 } 331 xb +=2; ai++; 332 } 333 334 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 335 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 336 ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 337 PetscFunctionReturn(0); 338 } 339 340 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 341 { 342 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 343 PetscScalar *z,x1,x2,x3,zero=0.0; 344 const PetscScalar *x,*xb; 345 const MatScalar *v; 346 PetscErrorCode ierr; 347 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 348 const PetscInt *aj = a->j,*ai = a->i,*ib; 349 PetscInt nonzerorow=0; 350 351 PetscFunctionBegin; 352 ierr = VecSet(zz,zero);CHKERRQ(ierr); 353 if (!a->nz) PetscFunctionReturn(0); 354 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 355 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 356 357 v = a->a; 358 xb = x; 359 360 for (i=0; i<mbs; i++) { 361 n = ai[1] - ai[0]; /* length of i_th block row of A */ 362 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 363 ib = aj + *ai; 364 jmin = 0; 365 nonzerorow += (n>0); 366 if (*ib == i) { /* (diag of A)*x */ 367 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 368 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 369 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 370 v += 9; jmin++; 371 } 372 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 373 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 374 for (j=jmin; j<n; j++) { 375 /* (strict lower triangular part of A)*x */ 376 cval = ib[j]*3; 377 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 378 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 379 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 380 /* (strict upper triangular part of A)*x */ 381 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 382 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 383 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 384 v += 9; 385 } 386 xb +=3; ai++; 387 } 388 389 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 390 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 391 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 392 PetscFunctionReturn(0); 393 } 394 395 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 396 { 397 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 398 PetscScalar *z,x1,x2,x3,x4,zero=0.0; 399 const PetscScalar *x,*xb; 400 const MatScalar *v; 401 PetscErrorCode ierr; 402 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 403 const PetscInt *aj = a->j,*ai = a->i,*ib; 404 PetscInt nonzerorow = 0; 405 406 PetscFunctionBegin; 407 ierr = VecSet(zz,zero);CHKERRQ(ierr); 408 if (!a->nz) PetscFunctionReturn(0); 409 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 410 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 411 412 v = a->a; 413 xb = x; 414 415 for (i=0; i<mbs; i++) { 416 n = ai[1] - ai[0]; /* length of i_th block row of A */ 417 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 418 ib = aj + *ai; 419 jmin = 0; 420 nonzerorow += (n>0); 421 if (*ib == i) { /* (diag of A)*x */ 422 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 423 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 424 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 425 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 426 v += 16; jmin++; 427 } 428 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 429 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 430 for (j=jmin; j<n; j++) { 431 /* (strict lower triangular part of A)*x */ 432 cval = ib[j]*4; 433 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 434 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 435 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 436 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 437 /* (strict upper triangular part of A)*x */ 438 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 439 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 440 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 441 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 442 v += 16; 443 } 444 xb +=4; ai++; 445 } 446 447 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 448 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 449 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 450 PetscFunctionReturn(0); 451 } 452 453 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 454 { 455 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 456 PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0; 457 const PetscScalar *x,*xb; 458 const MatScalar *v; 459 PetscErrorCode ierr; 460 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 461 const PetscInt *aj = a->j,*ai = a->i,*ib; 462 PetscInt nonzerorow=0; 463 464 PetscFunctionBegin; 465 ierr = VecSet(zz,zero);CHKERRQ(ierr); 466 if (!a->nz) PetscFunctionReturn(0); 467 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 468 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 469 470 v = a->a; 471 xb = x; 472 473 for (i=0; i<mbs; i++) { 474 n = ai[1] - ai[0]; /* length of i_th block row of A */ 475 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 476 ib = aj + *ai; 477 jmin = 0; 478 nonzerorow += (n>0); 479 if (*ib == i) { /* (diag of A)*x */ 480 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 481 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 482 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 483 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 484 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 485 v += 25; jmin++; 486 } 487 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 488 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 489 for (j=jmin; j<n; j++) { 490 /* (strict lower triangular part of A)*x */ 491 cval = ib[j]*5; 492 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 493 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 494 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 495 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 496 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 497 /* (strict upper triangular part of A)*x */ 498 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 499 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 500 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 501 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 502 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 503 v += 25; 504 } 505 xb +=5; ai++; 506 } 507 508 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 509 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 510 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 511 PetscFunctionReturn(0); 512 } 513 514 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 515 { 516 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 517 PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0; 518 const PetscScalar *x,*xb; 519 const MatScalar *v; 520 PetscErrorCode ierr; 521 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 522 const PetscInt *aj=a->j,*ai=a->i,*ib; 523 PetscInt nonzerorow=0; 524 525 PetscFunctionBegin; 526 ierr = VecSet(zz,zero);CHKERRQ(ierr); 527 if (!a->nz) PetscFunctionReturn(0); 528 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 529 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 530 531 v = a->a; 532 xb = x; 533 534 for (i=0; i<mbs; i++) { 535 n = ai[1] - ai[0]; /* length of i_th block row of A */ 536 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 537 ib = aj + *ai; 538 jmin = 0; 539 nonzerorow += (n>0); 540 if (*ib == i) { /* (diag of A)*x */ 541 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 542 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 543 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 544 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 545 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 546 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 547 v += 36; jmin++; 548 } 549 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 550 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 551 for (j=jmin; j<n; j++) { 552 /* (strict lower triangular part of A)*x */ 553 cval = ib[j]*6; 554 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 555 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 556 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 557 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 558 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 559 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 560 /* (strict upper triangular part of A)*x */ 561 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 562 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 563 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 564 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 565 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 566 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 567 v += 36; 568 } 569 xb +=6; ai++; 570 } 571 572 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 573 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 574 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 575 PetscFunctionReturn(0); 576 } 577 578 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 579 { 580 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 581 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 582 const PetscScalar *x,*xb; 583 const MatScalar *v; 584 PetscErrorCode ierr; 585 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 586 const PetscInt *aj=a->j,*ai=a->i,*ib; 587 PetscInt nonzerorow=0; 588 589 PetscFunctionBegin; 590 ierr = VecSet(zz,zero);CHKERRQ(ierr); 591 if (!a->nz) PetscFunctionReturn(0); 592 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 593 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 594 595 v = a->a; 596 xb = x; 597 598 for (i=0; i<mbs; i++) { 599 n = ai[1] - ai[0]; /* length of i_th block row of A */ 600 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 601 ib = aj + *ai; 602 jmin = 0; 603 nonzerorow += (n>0); 604 if (*ib == i) { /* (diag of A)*x */ 605 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 606 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 607 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 608 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 609 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 610 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 611 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 612 v += 49; jmin++; 613 } 614 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 615 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 616 for (j=jmin; j<n; j++) { 617 /* (strict lower triangular part of A)*x */ 618 cval = ib[j]*7; 619 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 620 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 621 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 622 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 623 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 624 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 625 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 626 /* (strict upper triangular part of A)*x */ 627 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 628 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 629 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 630 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 631 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 632 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 633 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 634 v += 49; 635 } 636 xb +=7; ai++; 637 } 638 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 639 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 640 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 641 PetscFunctionReturn(0); 642 } 643 644 /* 645 This will not work with MatScalar == float because it calls the BLAS 646 */ 647 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 648 { 649 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 650 PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0; 651 const PetscScalar *x,*x_ptr,*xb; 652 const MatScalar *v; 653 PetscErrorCode ierr; 654 PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 655 const PetscInt *idx,*aj,*ii; 656 PetscInt nonzerorow=0; 657 658 PetscFunctionBegin; 659 ierr = VecSet(zz,zero);CHKERRQ(ierr); 660 if (!a->nz) PetscFunctionReturn(0); 661 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 662 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 663 664 x_ptr = x; 665 z_ptr = z; 666 667 aj = a->j; 668 v = a->a; 669 ii = a->i; 670 671 if (!a->mult_work) { 672 ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr); 673 } 674 work = a->mult_work; 675 676 for (i=0; i<mbs; i++) { 677 n = ii[1] - ii[0]; ncols = n*bs; 678 workt = work; idx=aj+ii[0]; 679 nonzerorow += (n>0); 680 681 /* upper triangular part */ 682 for (j=0; j<n; j++) { 683 xb = x_ptr + bs*(*idx++); 684 for (k=0; k<bs; k++) workt[k] = xb[k]; 685 workt += bs; 686 } 687 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 688 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 689 690 /* strict lower triangular part */ 691 idx = aj+ii[0]; 692 if (n && *idx == i) { 693 ncols -= bs; v += bs2; idx++; n--; 694 } 695 696 if (ncols > 0) { 697 workt = work; 698 ierr = PetscArrayzero(workt,ncols);CHKERRQ(ierr); 699 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 700 for (j=0; j<n; j++) { 701 zb = z_ptr + bs*(*idx++); 702 for (k=0; k<bs; k++) zb[k] += workt[k]; 703 workt += bs; 704 } 705 } 706 x += bs; v += n*bs2; z += bs; ii++; 707 } 708 709 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 710 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 711 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 712 PetscFunctionReturn(0); 713 } 714 715 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 716 { 717 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 718 PetscScalar *z,x1; 719 const PetscScalar *x,*xb; 720 const MatScalar *v; 721 PetscErrorCode ierr; 722 PetscInt mbs =a->mbs,i,n,cval,j,jmin; 723 const PetscInt *aj=a->j,*ai=a->i,*ib; 724 PetscInt nonzerorow=0; 725 #if defined(PETSC_USE_COMPLEX) 726 const int aconj = A->hermitian; 727 #else 728 const int aconj = 0; 729 #endif 730 731 PetscFunctionBegin; 732 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 733 if (!a->nz) PetscFunctionReturn(0); 734 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 735 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 736 v = a->a; 737 xb = x; 738 739 for (i=0; i<mbs; i++) { 740 n = ai[1] - ai[0]; /* length of i_th row of A */ 741 x1 = xb[0]; 742 ib = aj + *ai; 743 jmin = 0; 744 nonzerorow += (n>0); 745 if (n && *ib == i) { /* (diag of A)*x */ 746 z[i] += *v++ * x[*ib++]; jmin++; 747 } 748 if (aconj) { 749 for (j=jmin; j<n; j++) { 750 cval = *ib; 751 z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */ 752 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 753 } 754 } else { 755 for (j=jmin; j<n; j++) { 756 cval = *ib; 757 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 758 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 759 } 760 } 761 xb++; ai++; 762 } 763 764 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 765 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 766 767 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 768 PetscFunctionReturn(0); 769 } 770 771 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 772 { 773 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 774 PetscScalar *z,x1,x2; 775 const PetscScalar *x,*xb; 776 const MatScalar *v; 777 PetscErrorCode ierr; 778 PetscInt mbs =a->mbs,i,n,cval,j,jmin; 779 const PetscInt *aj=a->j,*ai=a->i,*ib; 780 PetscInt nonzerorow=0; 781 782 PetscFunctionBegin; 783 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 784 if (!a->nz) PetscFunctionReturn(0); 785 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 786 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 787 788 v = a->a; 789 xb = x; 790 791 for (i=0; i<mbs; i++) { 792 n = ai[1] - ai[0]; /* length of i_th block row of A */ 793 x1 = xb[0]; x2 = xb[1]; 794 ib = aj + *ai; 795 jmin = 0; 796 nonzerorow += (n>0); 797 if (n && *ib == i) { /* (diag of A)*x */ 798 z[2*i] += v[0]*x1 + v[2]*x2; 799 z[2*i+1] += v[2]*x1 + v[3]*x2; 800 v += 4; jmin++; 801 } 802 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 803 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 804 for (j=jmin; j<n; j++) { 805 /* (strict lower triangular part of A)*x */ 806 cval = ib[j]*2; 807 z[cval] += v[0]*x1 + v[1]*x2; 808 z[cval+1] += v[2]*x1 + v[3]*x2; 809 /* (strict upper triangular part of A)*x */ 810 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 811 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 812 v += 4; 813 } 814 xb +=2; ai++; 815 } 816 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 817 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 818 819 ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 820 PetscFunctionReturn(0); 821 } 822 823 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 824 { 825 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 826 PetscScalar *z,x1,x2,x3; 827 const PetscScalar *x,*xb; 828 const MatScalar *v; 829 PetscErrorCode ierr; 830 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 831 const PetscInt *aj=a->j,*ai=a->i,*ib; 832 PetscInt nonzerorow=0; 833 834 PetscFunctionBegin; 835 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 836 if (!a->nz) PetscFunctionReturn(0); 837 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 838 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 839 840 v = a->a; 841 xb = x; 842 843 for (i=0; i<mbs; i++) { 844 n = ai[1] - ai[0]; /* length of i_th block row of A */ 845 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 846 ib = aj + *ai; 847 jmin = 0; 848 nonzerorow += (n>0); 849 if (n && *ib == i) { /* (diag of A)*x */ 850 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 851 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 852 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 853 v += 9; jmin++; 854 } 855 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 856 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 857 for (j=jmin; j<n; j++) { 858 /* (strict lower triangular part of A)*x */ 859 cval = ib[j]*3; 860 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 861 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 862 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 863 /* (strict upper triangular part of A)*x */ 864 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 865 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 866 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 867 v += 9; 868 } 869 xb +=3; ai++; 870 } 871 872 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 873 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 874 875 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 876 PetscFunctionReturn(0); 877 } 878 879 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 880 { 881 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 882 PetscScalar *z,x1,x2,x3,x4; 883 const PetscScalar *x,*xb; 884 const MatScalar *v; 885 PetscErrorCode ierr; 886 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 887 const PetscInt *aj=a->j,*ai=a->i,*ib; 888 PetscInt nonzerorow=0; 889 890 PetscFunctionBegin; 891 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 892 if (!a->nz) PetscFunctionReturn(0); 893 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 894 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 895 896 v = a->a; 897 xb = x; 898 899 for (i=0; i<mbs; i++) { 900 n = ai[1] - ai[0]; /* length of i_th block row of A */ 901 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 902 ib = aj + *ai; 903 jmin = 0; 904 nonzerorow += (n>0); 905 if (n && *ib == i) { /* (diag of A)*x */ 906 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 907 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 908 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 909 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 910 v += 16; jmin++; 911 } 912 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 913 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 914 for (j=jmin; j<n; j++) { 915 /* (strict lower triangular part of A)*x */ 916 cval = ib[j]*4; 917 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 918 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 919 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 920 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 921 /* (strict upper triangular part of A)*x */ 922 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 923 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 924 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 925 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 926 v += 16; 927 } 928 xb +=4; ai++; 929 } 930 931 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 932 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 933 934 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 935 PetscFunctionReturn(0); 936 } 937 938 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 939 { 940 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 941 PetscScalar *z,x1,x2,x3,x4,x5; 942 const PetscScalar *x,*xb; 943 const MatScalar *v; 944 PetscErrorCode ierr; 945 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 946 const PetscInt *aj=a->j,*ai=a->i,*ib; 947 PetscInt nonzerorow=0; 948 949 PetscFunctionBegin; 950 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 951 if (!a->nz) PetscFunctionReturn(0); 952 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 953 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 954 955 v = a->a; 956 xb = x; 957 958 for (i=0; i<mbs; i++) { 959 n = ai[1] - ai[0]; /* length of i_th block row of A */ 960 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 961 ib = aj + *ai; 962 jmin = 0; 963 nonzerorow += (n>0); 964 if (n && *ib == i) { /* (diag of A)*x */ 965 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 966 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 967 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 968 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 969 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 970 v += 25; jmin++; 971 } 972 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 973 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 974 for (j=jmin; j<n; j++) { 975 /* (strict lower triangular part of A)*x */ 976 cval = ib[j]*5; 977 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 978 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 979 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 980 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 981 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 982 /* (strict upper triangular part of A)*x */ 983 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 984 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 985 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 986 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 987 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 988 v += 25; 989 } 990 xb +=5; ai++; 991 } 992 993 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 994 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 995 996 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 997 PetscFunctionReturn(0); 998 } 999 1000 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1001 { 1002 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1003 PetscScalar *z,x1,x2,x3,x4,x5,x6; 1004 const PetscScalar *x,*xb; 1005 const MatScalar *v; 1006 PetscErrorCode ierr; 1007 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 1008 const PetscInt *aj=a->j,*ai=a->i,*ib; 1009 PetscInt nonzerorow=0; 1010 1011 PetscFunctionBegin; 1012 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1013 if (!a->nz) PetscFunctionReturn(0); 1014 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1015 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1016 1017 v = a->a; 1018 xb = x; 1019 1020 for (i=0; i<mbs; i++) { 1021 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1022 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 1023 ib = aj + *ai; 1024 jmin = 0; 1025 nonzerorow += (n>0); 1026 if (n && *ib == i) { /* (diag of A)*x */ 1027 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 1028 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 1029 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 1030 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 1031 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 1032 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1033 v += 36; jmin++; 1034 } 1035 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1036 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1037 for (j=jmin; j<n; j++) { 1038 /* (strict lower triangular part of A)*x */ 1039 cval = ib[j]*6; 1040 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 1041 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 1042 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 1043 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 1044 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 1045 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1046 /* (strict upper triangular part of A)*x */ 1047 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 1048 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 1049 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 1050 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 1051 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 1052 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 1053 v += 36; 1054 } 1055 xb +=6; ai++; 1056 } 1057 1058 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1059 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1060 1061 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1062 PetscFunctionReturn(0); 1063 } 1064 1065 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1066 { 1067 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1068 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7; 1069 const PetscScalar *x,*xb; 1070 const MatScalar *v; 1071 PetscErrorCode ierr; 1072 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 1073 const PetscInt *aj=a->j,*ai=a->i,*ib; 1074 PetscInt nonzerorow=0; 1075 1076 PetscFunctionBegin; 1077 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1078 if (!a->nz) PetscFunctionReturn(0); 1079 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1080 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1081 1082 v = a->a; 1083 xb = x; 1084 1085 for (i=0; i<mbs; i++) { 1086 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1087 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1088 ib = aj + *ai; 1089 jmin = 0; 1090 nonzerorow += (n>0); 1091 if (n && *ib == i) { /* (diag of A)*x */ 1092 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1093 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1094 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1095 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1096 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1097 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1098 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1099 v += 49; jmin++; 1100 } 1101 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1102 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1103 for (j=jmin; j<n; j++) { 1104 /* (strict lower triangular part of A)*x */ 1105 cval = ib[j]*7; 1106 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1107 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1108 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1109 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1110 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1111 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1112 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1113 /* (strict upper triangular part of A)*x */ 1114 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1115 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1116 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1117 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1118 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1119 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1120 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1121 v += 49; 1122 } 1123 xb +=7; ai++; 1124 } 1125 1126 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1127 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1128 1129 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1130 PetscFunctionReturn(0); 1131 } 1132 1133 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1134 { 1135 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1136 PetscScalar *z,*z_ptr=0,*zb,*work,*workt; 1137 const PetscScalar *x,*x_ptr,*xb; 1138 const MatScalar *v; 1139 PetscErrorCode ierr; 1140 PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1141 const PetscInt *idx,*aj,*ii; 1142 PetscInt nonzerorow=0; 1143 1144 PetscFunctionBegin; 1145 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1146 if (!a->nz) PetscFunctionReturn(0); 1147 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x; 1148 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1149 1150 aj = a->j; 1151 v = a->a; 1152 ii = a->i; 1153 1154 if (!a->mult_work) { 1155 ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr); 1156 } 1157 work = a->mult_work; 1158 1159 1160 for (i=0; i<mbs; i++) { 1161 n = ii[1] - ii[0]; ncols = n*bs; 1162 workt = work; idx=aj+ii[0]; 1163 nonzerorow += (n>0); 1164 1165 /* upper triangular part */ 1166 for (j=0; j<n; j++) { 1167 xb = x_ptr + bs*(*idx++); 1168 for (k=0; k<bs; k++) workt[k] = xb[k]; 1169 workt += bs; 1170 } 1171 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1172 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1173 1174 /* strict lower triangular part */ 1175 idx = aj+ii[0]; 1176 if (n && *idx == i) { 1177 ncols -= bs; v += bs2; idx++; n--; 1178 } 1179 if (ncols > 0) { 1180 workt = work; 1181 ierr = PetscArrayzero(workt,ncols);CHKERRQ(ierr); 1182 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1183 for (j=0; j<n; j++) { 1184 zb = z_ptr + bs*(*idx++); 1185 for (k=0; k<bs; k++) zb[k] += workt[k]; 1186 workt += bs; 1187 } 1188 } 1189 1190 x += bs; v += n*bs2; z += bs; ii++; 1191 } 1192 1193 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1194 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1195 1196 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1197 PetscFunctionReturn(0); 1198 } 1199 1200 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1201 { 1202 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1203 PetscScalar oalpha = alpha; 1204 PetscErrorCode ierr; 1205 PetscBLASInt one = 1,totalnz; 1206 1207 PetscFunctionBegin; 1208 ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr); 1209 PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one)); 1210 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1211 PetscFunctionReturn(0); 1212 } 1213 1214 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1215 { 1216 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1217 const MatScalar *v = a->a; 1218 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1219 PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il; 1220 PetscErrorCode ierr; 1221 const PetscInt *aj=a->j,*col; 1222 1223 PetscFunctionBegin; 1224 if (!a->nz) { 1225 *norm = 0.0; 1226 PetscFunctionReturn(0); 1227 } 1228 if (type == NORM_FROBENIUS) { 1229 for (k=0; k<mbs; k++) { 1230 jmin = a->i[k]; 1231 jmax = a->i[k+1]; 1232 col = aj + jmin; 1233 if (jmax-jmin > 0 && *col == k) { /* diagonal block */ 1234 for (i=0; i<bs2; i++) { 1235 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1236 } 1237 jmin++; 1238 } 1239 for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */ 1240 for (i=0; i<bs2; i++) { 1241 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1242 } 1243 } 1244 } 1245 *norm = PetscSqrtReal(sum_diag + 2*sum_off); 1246 ierr = PetscLogFlops(2*bs2*a->nz);CHKERRQ(ierr); 1247 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1248 ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr); 1249 for (i=0; i<mbs; i++) jl[i] = mbs; 1250 il[0] = 0; 1251 1252 *norm = 0.0; 1253 for (k=0; k<mbs; k++) { /* k_th block row */ 1254 for (j=0; j<bs; j++) sum[j]=0.0; 1255 /*-- col sum --*/ 1256 i = jl[k]; /* first |A(i,k)| to be added */ 1257 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1258 at step k */ 1259 while (i<mbs) { 1260 nexti = jl[i]; /* next block row to be added */ 1261 ik = il[i]; /* block index of A(i,k) in the array a */ 1262 for (j=0; j<bs; j++) { 1263 v = a->a + ik*bs2 + j*bs; 1264 for (k1=0; k1<bs; k1++) { 1265 sum[j] += PetscAbsScalar(*v); v++; 1266 } 1267 } 1268 /* update il, jl */ 1269 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1270 jmax = a->i[i+1]; 1271 if (jmin < jmax) { 1272 il[i] = jmin; 1273 j = a->j[jmin]; 1274 jl[i] = jl[j]; jl[j]=i; 1275 } 1276 i = nexti; 1277 } 1278 /*-- row sum --*/ 1279 jmin = a->i[k]; 1280 jmax = a->i[k+1]; 1281 for (i=jmin; i<jmax; i++) { 1282 for (j=0; j<bs; j++) { 1283 v = a->a + i*bs2 + j; 1284 for (k1=0; k1<bs; k1++) { 1285 sum[j] += PetscAbsScalar(*v); v += bs; 1286 } 1287 } 1288 } 1289 /* add k_th block row to il, jl */ 1290 col = aj+jmin; 1291 if (jmax - jmin > 0 && *col == k) jmin++; 1292 if (jmin < jmax) { 1293 il[k] = jmin; 1294 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1295 } 1296 for (j=0; j<bs; j++) { 1297 if (sum[j] > *norm) *norm = sum[j]; 1298 } 1299 } 1300 ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 1301 ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr); 1302 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1303 PetscFunctionReturn(0); 1304 } 1305 1306 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg) 1307 { 1308 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data; 1309 PetscErrorCode ierr; 1310 1311 PetscFunctionBegin; 1312 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1313 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1314 *flg = PETSC_FALSE; 1315 PetscFunctionReturn(0); 1316 } 1317 1318 /* if the a->i are the same */ 1319 ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr); 1320 if (!*flg) PetscFunctionReturn(0); 1321 1322 /* if a->j are the same */ 1323 ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr); 1324 if (!*flg) PetscFunctionReturn(0); 1325 1326 /* if a->a are the same */ 1327 ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg);CHKERRQ(ierr); 1328 PetscFunctionReturn(0); 1329 } 1330 1331 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1332 { 1333 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1334 PetscErrorCode ierr; 1335 PetscInt i,j,k,row,bs,ambs,bs2; 1336 const PetscInt *ai,*aj; 1337 PetscScalar *x,zero = 0.0; 1338 const MatScalar *aa,*aa_j; 1339 1340 PetscFunctionBegin; 1341 bs = A->rmap->bs; 1342 if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1343 1344 aa = a->a; 1345 ambs = a->mbs; 1346 1347 if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 1348 PetscInt *diag=a->diag; 1349 aa = a->a; 1350 ambs = a->mbs; 1351 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1352 for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]]; 1353 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1354 PetscFunctionReturn(0); 1355 } 1356 1357 ai = a->i; 1358 aj = a->j; 1359 bs2 = a->bs2; 1360 ierr = VecSet(v,zero);CHKERRQ(ierr); 1361 if (!a->nz) PetscFunctionReturn(0); 1362 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1363 for (i=0; i<ambs; i++) { 1364 j = ai[i]; 1365 if (aj[j] == i) { /* if this is a diagonal element */ 1366 row = i*bs; 1367 aa_j = aa + j*bs2; 1368 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1369 } 1370 } 1371 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1372 PetscFunctionReturn(0); 1373 } 1374 1375 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1376 { 1377 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1378 PetscScalar x; 1379 const PetscScalar *l,*li,*ri; 1380 MatScalar *aa,*v; 1381 PetscErrorCode ierr; 1382 PetscInt i,j,k,lm,M,m,mbs,tmp,bs,bs2; 1383 const PetscInt *ai,*aj; 1384 PetscBool flg; 1385 1386 PetscFunctionBegin; 1387 if (ll != rr) { 1388 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1389 if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1390 } 1391 if (!ll) PetscFunctionReturn(0); 1392 ai = a->i; 1393 aj = a->j; 1394 aa = a->a; 1395 m = A->rmap->N; 1396 bs = A->rmap->bs; 1397 mbs = a->mbs; 1398 bs2 = a->bs2; 1399 1400 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 1401 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1402 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1403 for (i=0; i<mbs; i++) { /* for each block row */ 1404 M = ai[i+1] - ai[i]; 1405 li = l + i*bs; 1406 v = aa + bs2*ai[i]; 1407 for (j=0; j<M; j++) { /* for each block */ 1408 ri = l + bs*aj[ai[i]+j]; 1409 for (k=0; k<bs; k++) { 1410 x = ri[k]; 1411 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1412 } 1413 } 1414 } 1415 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 1416 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1417 PetscFunctionReturn(0); 1418 } 1419 1420 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1421 { 1422 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1423 1424 PetscFunctionBegin; 1425 info->block_size = a->bs2; 1426 info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */ 1427 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1428 info->nz_unneeded = info->nz_allocated - info->nz_used; 1429 info->assemblies = A->num_ass; 1430 info->mallocs = A->info.mallocs; 1431 info->memory = ((PetscObject)A)->mem; 1432 if (A->factortype) { 1433 info->fill_ratio_given = A->info.fill_ratio_given; 1434 info->fill_ratio_needed = A->info.fill_ratio_needed; 1435 info->factor_mallocs = A->info.factor_mallocs; 1436 } else { 1437 info->fill_ratio_given = 0; 1438 info->fill_ratio_needed = 0; 1439 info->factor_mallocs = 0; 1440 } 1441 PetscFunctionReturn(0); 1442 } 1443 1444 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1445 { 1446 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1447 PetscErrorCode ierr; 1448 1449 PetscFunctionBegin; 1450 ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr); 1451 PetscFunctionReturn(0); 1452 } 1453 1454 /* 1455 This code does not work since it only checks the upper triangular part of 1456 the matrix. Hence it is not listed in the function table. 1457 */ 1458 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1459 { 1460 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1461 PetscErrorCode ierr; 1462 PetscInt i,j,n,row,col,bs,mbs; 1463 const PetscInt *ai,*aj; 1464 PetscReal atmp; 1465 const MatScalar *aa; 1466 PetscScalar *x; 1467 PetscInt ncols,brow,bcol,krow,kcol; 1468 1469 PetscFunctionBegin; 1470 if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1471 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1472 bs = A->rmap->bs; 1473 aa = a->a; 1474 ai = a->i; 1475 aj = a->j; 1476 mbs = a->mbs; 1477 1478 ierr = VecSet(v,0.0);CHKERRQ(ierr); 1479 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1480 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1481 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1482 for (i=0; i<mbs; i++) { 1483 ncols = ai[1] - ai[0]; ai++; 1484 brow = bs*i; 1485 for (j=0; j<ncols; j++) { 1486 bcol = bs*(*aj); 1487 for (kcol=0; kcol<bs; kcol++) { 1488 col = bcol + kcol; /* col index */ 1489 for (krow=0; krow<bs; krow++) { 1490 atmp = PetscAbsScalar(*aa); aa++; 1491 row = brow + krow; /* row index */ 1492 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1493 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1494 } 1495 } 1496 aj++; 1497 } 1498 } 1499 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1500 PetscFunctionReturn(0); 1501 } 1502 1503 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C) 1504 { 1505 PetscErrorCode ierr; 1506 1507 PetscFunctionBegin; 1508 ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr); 1509 C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense; 1510 PetscFunctionReturn(0); 1511 } 1512 1513 PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1514 { 1515 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1516 PetscScalar *z = c; 1517 const PetscScalar *xb; 1518 PetscScalar x1; 1519 const MatScalar *v = a->a,*vv; 1520 PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1521 #if defined(PETSC_USE_COMPLEX) 1522 const int aconj = A->hermitian; 1523 #else 1524 const int aconj = 0; 1525 #endif 1526 1527 PetscFunctionBegin; 1528 for (i=0; i<mbs; i++) { 1529 n = ii[1] - ii[0]; ii++; 1530 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1531 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1532 jj = idx; 1533 vv = v; 1534 for (k=0; k<cn; k++) { 1535 idx = jj; 1536 v = vv; 1537 for (j=0; j<n; j++) { 1538 xb = b + (*idx); x1 = xb[0+k*bm]; 1539 z[0+k*cm] += v[0]*x1; 1540 if (*idx != i) c[(*idx)+k*cm] += (aconj ? PetscConj(v[0]) : v[0])*b[i+k*bm]; 1541 v += 1; 1542 ++idx; 1543 } 1544 } 1545 z += 1; 1546 } 1547 PetscFunctionReturn(0); 1548 } 1549 1550 PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1551 { 1552 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1553 PetscScalar *z = c; 1554 const PetscScalar *xb; 1555 PetscScalar x1,x2; 1556 const MatScalar *v = a->a,*vv; 1557 PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1558 1559 PetscFunctionBegin; 1560 for (i=0; i<mbs; i++) { 1561 n = ii[1] - ii[0]; ii++; 1562 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1563 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1564 jj = idx; 1565 vv = v; 1566 for (k=0; k<cn; k++) { 1567 idx = jj; 1568 v = vv; 1569 for (j=0; j<n; j++) { 1570 xb = b + 2*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; 1571 z[0+k*cm] += v[0]*x1 + v[2]*x2; 1572 z[1+k*cm] += v[1]*x1 + v[3]*x2; 1573 if (*idx != i) { 1574 c[2*(*idx)+0+k*cm] += v[0]*b[2*i+k*bm] + v[1]*b[2*i+1+k*bm]; 1575 c[2*(*idx)+1+k*cm] += v[2]*b[2*i+k*bm] + v[3]*b[2*i+1+k*bm]; 1576 } 1577 v += 4; 1578 ++idx; 1579 } 1580 } 1581 z += 2; 1582 } 1583 PetscFunctionReturn(0); 1584 } 1585 1586 PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1587 { 1588 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1589 PetscScalar *z = c; 1590 const PetscScalar *xb; 1591 PetscScalar x1,x2,x3; 1592 const MatScalar *v = a->a,*vv; 1593 PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1594 1595 PetscFunctionBegin; 1596 for (i=0; i<mbs; i++) { 1597 n = ii[1] - ii[0]; ii++; 1598 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1599 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1600 jj = idx; 1601 vv = v; 1602 for (k=0; k<cn; k++) { 1603 idx = jj; 1604 v = vv; 1605 for (j=0; j<n; j++) { 1606 xb = b + 3*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; 1607 z[0+k*cm] += v[0]*x1 + v[3]*x2 + v[6]*x3; 1608 z[1+k*cm] += v[1]*x1 + v[4]*x2 + v[7]*x3; 1609 z[2+k*cm] += v[2]*x1 + v[5]*x2 + v[8]*x3; 1610 if (*idx != i) { 1611 c[3*(*idx)+0+k*cm] += v[0]*b[3*i+k*bm] + v[3]*b[3*i+1+k*bm] + v[6]*b[3*i+2+k*bm]; 1612 c[3*(*idx)+1+k*cm] += v[1]*b[3*i+k*bm] + v[4]*b[3*i+1+k*bm] + v[7]*b[3*i+2+k*bm]; 1613 c[3*(*idx)+2+k*cm] += v[2]*b[3*i+k*bm] + v[5]*b[3*i+1+k*bm] + v[8]*b[3*i+2+k*bm]; 1614 } 1615 v += 9; 1616 ++idx; 1617 } 1618 } 1619 z += 3; 1620 } 1621 PetscFunctionReturn(0); 1622 } 1623 1624 PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1625 { 1626 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1627 PetscScalar *z = c; 1628 const PetscScalar *xb; 1629 PetscScalar x1,x2,x3,x4; 1630 const MatScalar *v = a->a,*vv; 1631 PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1632 1633 PetscFunctionBegin; 1634 for (i=0; i<mbs; i++) { 1635 n = ii[1] - ii[0]; ii++; 1636 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1637 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1638 jj = idx; 1639 vv = v; 1640 for (k=0; k<cn; k++) { 1641 idx = jj; 1642 v = vv; 1643 for (j=0; j<n; j++) { 1644 xb = b + 4*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; 1645 z[0+k*cm] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1646 z[1+k*cm] += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1647 z[2+k*cm] += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1648 z[3+k*cm] += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1649 if (*idx != i) { 1650 c[4*(*idx)+0+k*cm] += v[0]*b[4*i+k*bm] + v[4]*b[4*i+1+k*bm] + v[8]*b[4*i+2+k*bm] + v[12]*b[4*i+3+k*bm]; 1651 c[4*(*idx)+1+k*cm] += v[1]*b[4*i+k*bm] + v[5]*b[4*i+1+k*bm] + v[9]*b[4*i+2+k*bm] + v[13]*b[4*i+3+k*bm]; 1652 c[4*(*idx)+2+k*cm] += v[2]*b[4*i+k*bm] + v[6]*b[4*i+1+k*bm] + v[10]*b[4*i+2+k*bm] + v[14]*b[4*i+3+k*bm]; 1653 c[4*(*idx)+3+k*cm] += v[3]*b[4*i+k*bm] + v[7]*b[4*i+1+k*bm] + v[11]*b[4*i+2+k*bm] + v[15]*b[4*i+3+k*bm]; 1654 } 1655 v += 16; 1656 ++idx; 1657 } 1658 } 1659 z += 4; 1660 } 1661 PetscFunctionReturn(0); 1662 } 1663 1664 PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1665 { 1666 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1667 PetscScalar *z = c; 1668 const PetscScalar *xb; 1669 PetscScalar x1,x2,x3,x4,x5; 1670 const MatScalar *v = a->a,*vv; 1671 PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1672 1673 PetscFunctionBegin; 1674 for (i=0; i<mbs; i++) { 1675 n = ii[1] - ii[0]; ii++; 1676 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1677 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1678 jj = idx; 1679 vv = v; 1680 for (k=0; k<cn; k++) { 1681 idx = jj; 1682 v = vv; 1683 for (j=0; j<n; j++) { 1684 xb = b + 5*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*cm]; 1685 z[0+k*cm] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1686 z[1+k*cm] += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1687 z[2+k*cm] += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1688 z[3+k*cm] += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1689 z[4+k*cm] += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1690 if (*idx != i) { 1691 c[5*(*idx)+0+k*cm] += v[0]*b[5*i+k*bm] + v[5]*b[5*i+1+k*bm] + v[10]*b[5*i+2+k*bm] + v[15]*b[5*i+3+k*bm] + v[20]*b[5*i+4+k*bm]; 1692 c[5*(*idx)+1+k*cm] += v[1]*b[5*i+k*bm] + v[6]*b[5*i+1+k*bm] + v[11]*b[5*i+2+k*bm] + v[16]*b[5*i+3+k*bm] + v[21]*b[5*i+4+k*bm]; 1693 c[5*(*idx)+2+k*cm] += v[2]*b[5*i+k*bm] + v[7]*b[5*i+1+k*bm] + v[12]*b[5*i+2+k*bm] + v[17]*b[5*i+3+k*bm] + v[22]*b[5*i+4+k*bm]; 1694 c[5*(*idx)+3+k*cm] += v[3]*b[5*i+k*bm] + v[8]*b[5*i+1+k*bm] + v[13]*b[5*i+2+k*bm] + v[18]*b[5*i+3+k*bm] + v[23]*b[5*i+4+k*bm]; 1695 c[5*(*idx)+4+k*cm] += v[4]*b[5*i+k*bm] + v[9]*b[5*i+1+k*bm] + v[14]*b[5*i+2+k*bm] + v[19]*b[5*i+3+k*bm] + v[24]*b[5*i+4+k*bm]; 1696 } 1697 v += 25; 1698 ++idx; 1699 } 1700 } 1701 z += 5; 1702 } 1703 PetscFunctionReturn(0); 1704 } 1705 1706 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C) 1707 { 1708 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1709 Mat_SeqDense *bd = (Mat_SeqDense*)B->data; 1710 Mat_SeqDense *cd = (Mat_SeqDense*)C->data; 1711 PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda; 1712 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1713 PetscBLASInt bbs,bcn,bbm,bcm; 1714 PetscScalar *z = 0; 1715 PetscScalar *c,*b; 1716 const MatScalar *v; 1717 const PetscInt *idx,*ii; 1718 PetscScalar _DOne=1.0; 1719 PetscErrorCode ierr; 1720 1721 PetscFunctionBegin; 1722 if (!cm || !cn) PetscFunctionReturn(0); 1723 if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n); 1724 if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n); 1725 if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n); 1726 b = bd->v; 1727 ierr = MatZeroEntries(C);CHKERRQ(ierr); 1728 ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr); 1729 switch (bs) { 1730 case 1: 1731 ierr = MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn); 1732 break; 1733 case 2: 1734 ierr = MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn); 1735 break; 1736 case 3: 1737 ierr = MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn); 1738 break; 1739 case 4: 1740 ierr = MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn); 1741 break; 1742 case 5: 1743 ierr = MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn); 1744 break; 1745 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 1746 ierr = PetscBLASIntCast(bs,&bbs);CHKERRQ(ierr); 1747 ierr = PetscBLASIntCast(cn,&bcn);CHKERRQ(ierr); 1748 ierr = PetscBLASIntCast(bm,&bbm);CHKERRQ(ierr); 1749 ierr = PetscBLASIntCast(cm,&bcm);CHKERRQ(ierr); 1750 idx = a->j; 1751 v = a->a; 1752 mbs = a->mbs; 1753 ii = a->i; 1754 z = c; 1755 for (i=0; i<mbs; i++) { 1756 n = ii[1] - ii[0]; ii++; 1757 for (j=0; j<n; j++) { 1758 if (*idx != i) PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*i,&bbm,&_DOne,c+bs*(*idx),&bcm)); 1759 PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm)); 1760 v += bs2; 1761 } 1762 z += bs; 1763 } 1764 } 1765 ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr); 1766 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1767 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1768 ierr = PetscLogFlops((2.0*(a->nz*2.0 - a->nonzerorowcnt)*bs2 - a->nonzerorowcnt)*cn);CHKERRQ(ierr); 1769 PetscFunctionReturn(0); 1770 } 1771