1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <../src/mat/impls/sbaij/seq/sbaij.h> 6 #include <petscblaslapack.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 11 { 12 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 15 const PetscInt *idx; 16 PetscBT table_out,table_in; 17 18 PetscFunctionBegin; 19 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 20 mbs = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr); 25 ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr); 26 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 27 ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr); 28 29 for (i=0; i<is_max; i++) { /* for each is */ 30 isz = 0; 31 ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr); 32 33 /* Extract the indices, assume there can be duplicate entries */ 34 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36 37 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 38 bcol_max = 0; 39 for (j=0; j<n; ++j) { 40 brow = idx[j]/bs; /* convert the indices into block indices */ 41 if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 42 if (!PetscBTLookupSet(table_out,brow)) { 43 nidx[isz++] = brow; 44 if (bcol_max < brow) bcol_max = brow; 45 } 46 } 47 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 48 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 49 50 k = 0; 51 for (j=0; j<ov; j++) { /* for each overlap */ 52 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 53 ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr); 54 for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); } 55 56 n = isz; /* length of the updated is[i] */ 57 for (brow=0; brow<mbs; brow++) { 58 start = ai[brow]; end = ai[brow+1]; 59 if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 60 for (l = start; l<end; l++) { 61 bcol = aj[l]; 62 if (!PetscBTLookupSet(table_out,bcol)) { 63 nidx[isz++] = bcol; 64 if (bcol_max < bcol) bcol_max = bcol; 65 } 66 } 67 k++; 68 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 69 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 70 for (l = start; l<end; l++) { 71 bcol = aj[l]; 72 if (bcol > bcol_max) break; 73 if (PetscBTLookup(table_in,bcol)) { 74 if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow; 75 break; /* for l = start; l<end ; l++) */ 76 } 77 } 78 } 79 } 80 } /* for each overlap */ 81 82 /* expand the Index Set */ 83 for (j=0; j<isz; j++) { 84 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 85 } 86 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 87 } 88 ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr); 89 ierr = PetscFree(nidx);CHKERRQ(ierr); 90 ierr = PetscFree(nidx2);CHKERRQ(ierr); 91 ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr); 92 PetscFunctionReturn(0); 93 } 94 95 #undef __FUNCT__ 96 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 97 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,MatReuse scall,Mat *B) 98 { 99 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 100 PetscErrorCode ierr; 101 PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 102 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 103 PetscInt nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 104 const PetscInt *irow,*aj = a->j,*ai = a->i; 105 MatScalar *mat_a; 106 Mat C; 107 PetscBool flag,sorted; 108 109 PetscFunctionBegin; 110 ierr = ISSorted(isrow,&sorted);CHKERRQ(ierr); 111 if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 112 113 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 114 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 115 116 ierr = PetscMalloc1(oldcols,&smap);CHKERRQ(ierr); 117 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 118 ssmap = smap; 119 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 120 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 121 /* determine lens of each row */ 122 for (i=0; i<nrows; i++) { 123 kstart = ai[irow[i]]; 124 kend = kstart + a->ilen[irow[i]]; 125 lens[i] = 0; 126 for (k=kstart; k<kend; k++) { 127 if (ssmap[aj[k]]) lens[i]++; 128 } 129 } 130 /* Create and fill new matrix */ 131 if (scall == MAT_REUSE_MATRIX) { 132 c = (Mat_SeqSBAIJ*)((*B)->data); 133 134 if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 135 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 136 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 137 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 138 C = *B; 139 } else { 140 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 141 ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 142 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 143 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr); 144 } 145 c = (Mat_SeqSBAIJ*)(C->data); 146 for (i=0; i<nrows; i++) { 147 row = irow[i]; 148 kstart = ai[row]; 149 kend = kstart + a->ilen[row]; 150 mat_i = c->i[i]; 151 mat_j = c->j + mat_i; 152 mat_a = c->a + mat_i*bs2; 153 mat_ilen = c->ilen + i; 154 for (k=kstart; k<kend; k++) { 155 if ((tcol=ssmap[a->j[k]])) { 156 *mat_j++ = tcol - 1; 157 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 158 mat_a += bs2; 159 (*mat_ilen)++; 160 } 161 } 162 } 163 164 /* Free work space */ 165 ierr = PetscFree(smap);CHKERRQ(ierr); 166 ierr = PetscFree(lens);CHKERRQ(ierr); 167 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 168 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 169 170 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 171 *B = C; 172 PetscFunctionReturn(0); 173 } 174 175 #undef __FUNCT__ 176 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 177 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 178 { 179 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 180 IS is1; 181 PetscErrorCode ierr; 182 PetscInt *vary,*iary,nrows,i,bs=A->rmap->bs,count; 183 const PetscInt *irow; 184 185 PetscFunctionBegin; 186 if (isrow != iscol) { 187 PetscBool isequal; 188 ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr); 189 if (!isequal) 190 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isrow"); 191 } 192 193 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 194 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 195 196 /* Verify if the indices corespond to each element in a block 197 and form the IS with compressed IS */ 198 ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr); 199 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 200 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 201 202 count = 0; 203 for (i=0; i<a->mbs; i++) { 204 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 205 if (vary[i]==bs) iary[count++] = i; 206 } 207 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 208 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 209 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 210 211 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,scall,B);CHKERRQ(ierr); 212 ierr = ISDestroy(&is1);CHKERRQ(ierr); 213 PetscFunctionReturn(0); 214 } 215 216 #undef __FUNCT__ 217 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 218 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 219 { 220 PetscErrorCode ierr; 221 PetscInt i; 222 223 PetscFunctionBegin; 224 if (scall == MAT_INITIAL_MATRIX) { 225 ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr); 226 } 227 228 for (i=0; i<n; i++) { 229 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 230 } 231 PetscFunctionReturn(0); 232 } 233 234 /* -------------------------------------------------------*/ 235 /* Should check that shapes of vectors and matrices match */ 236 /* -------------------------------------------------------*/ 237 238 #undef __FUNCT__ 239 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 240 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 241 { 242 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 243 PetscScalar *z,x1,x2,zero=0.0; 244 const PetscScalar *x,*xb; 245 const MatScalar *v; 246 PetscErrorCode ierr; 247 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 248 const PetscInt *aj=a->j,*ai=a->i,*ib; 249 PetscInt nonzerorow=0; 250 251 PetscFunctionBegin; 252 ierr = VecSet(zz,zero);CHKERRQ(ierr); 253 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 254 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 255 256 v = a->a; 257 xb = x; 258 259 for (i=0; i<mbs; i++) { 260 n = ai[1] - ai[0]; /* length of i_th block row of A */ 261 x1 = xb[0]; x2 = xb[1]; 262 ib = aj + *ai; 263 jmin = 0; 264 nonzerorow += (n>0); 265 if (*ib == i) { /* (diag of A)*x */ 266 z[2*i] += v[0]*x1 + v[2]*x2; 267 z[2*i+1] += v[2]*x1 + v[3]*x2; 268 v += 4; jmin++; 269 } 270 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 271 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 272 for (j=jmin; j<n; j++) { 273 /* (strict lower triangular part of A)*x */ 274 cval = ib[j]*2; 275 z[cval] += v[0]*x1 + v[1]*x2; 276 z[cval+1] += v[2]*x1 + v[3]*x2; 277 /* (strict upper triangular part of A)*x */ 278 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 279 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 280 v += 4; 281 } 282 xb +=2; ai++; 283 } 284 285 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 286 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 287 ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 288 PetscFunctionReturn(0); 289 } 290 291 #undef __FUNCT__ 292 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 293 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 294 { 295 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 296 PetscScalar *z,x1,x2,x3,zero=0.0; 297 const PetscScalar *x,*xb; 298 const MatScalar *v; 299 PetscErrorCode ierr; 300 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 301 const PetscInt *aj = a->j,*ai = a->i,*ib; 302 PetscInt nonzerorow=0; 303 304 PetscFunctionBegin; 305 ierr = VecSet(zz,zero);CHKERRQ(ierr); 306 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 307 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 308 309 v = a->a; 310 xb = x; 311 312 for (i=0; i<mbs; i++) { 313 n = ai[1] - ai[0]; /* length of i_th block row of A */ 314 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 315 ib = aj + *ai; 316 jmin = 0; 317 nonzerorow += (n>0); 318 if (*ib == i) { /* (diag of A)*x */ 319 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 320 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 321 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 322 v += 9; jmin++; 323 } 324 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 325 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 326 for (j=jmin; j<n; j++) { 327 /* (strict lower triangular part of A)*x */ 328 cval = ib[j]*3; 329 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 330 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 331 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 332 /* (strict upper triangular part of A)*x */ 333 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 334 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 335 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 336 v += 9; 337 } 338 xb +=3; ai++; 339 } 340 341 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 342 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 343 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 344 PetscFunctionReturn(0); 345 } 346 347 #undef __FUNCT__ 348 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 349 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 350 { 351 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 352 PetscScalar *z,x1,x2,x3,x4,zero=0.0; 353 const PetscScalar *x,*xb; 354 const MatScalar *v; 355 PetscErrorCode ierr; 356 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 357 const PetscInt *aj = a->j,*ai = a->i,*ib; 358 PetscInt nonzerorow = 0; 359 360 PetscFunctionBegin; 361 ierr = VecSet(zz,zero);CHKERRQ(ierr); 362 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 363 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 364 365 v = a->a; 366 xb = x; 367 368 for (i=0; i<mbs; i++) { 369 n = ai[1] - ai[0]; /* length of i_th block row of A */ 370 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 371 ib = aj + *ai; 372 jmin = 0; 373 nonzerorow += (n>0); 374 if (*ib == i) { /* (diag of A)*x */ 375 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 376 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 377 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 378 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 379 v += 16; jmin++; 380 } 381 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 382 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 383 for (j=jmin; j<n; j++) { 384 /* (strict lower triangular part of A)*x */ 385 cval = ib[j]*4; 386 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 387 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 388 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 389 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 390 /* (strict upper triangular part of A)*x */ 391 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 392 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 393 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 394 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 395 v += 16; 396 } 397 xb +=4; ai++; 398 } 399 400 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 401 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 402 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 403 PetscFunctionReturn(0); 404 } 405 406 #undef __FUNCT__ 407 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 408 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 409 { 410 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 411 PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0; 412 const PetscScalar *x,*xb; 413 const MatScalar *v; 414 PetscErrorCode ierr; 415 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 416 const PetscInt *aj = a->j,*ai = a->i,*ib; 417 PetscInt nonzerorow=0; 418 419 PetscFunctionBegin; 420 ierr = VecSet(zz,zero);CHKERRQ(ierr); 421 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 422 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 423 424 v = a->a; 425 xb = x; 426 427 for (i=0; i<mbs; i++) { 428 n = ai[1] - ai[0]; /* length of i_th block row of A */ 429 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 430 ib = aj + *ai; 431 jmin = 0; 432 nonzerorow += (n>0); 433 if (*ib == i) { /* (diag of A)*x */ 434 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 435 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 436 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 437 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 438 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 439 v += 25; jmin++; 440 } 441 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 442 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 443 for (j=jmin; j<n; j++) { 444 /* (strict lower triangular part of A)*x */ 445 cval = ib[j]*5; 446 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 447 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 448 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 449 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 450 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 451 /* (strict upper triangular part of A)*x */ 452 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 453 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 454 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 455 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 456 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 457 v += 25; 458 } 459 xb +=5; ai++; 460 } 461 462 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 463 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 464 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 465 PetscFunctionReturn(0); 466 } 467 468 469 #undef __FUNCT__ 470 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 471 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 472 { 473 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 474 PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0; 475 const PetscScalar *x,*xb; 476 const MatScalar *v; 477 PetscErrorCode ierr; 478 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 479 const PetscInt *aj=a->j,*ai=a->i,*ib; 480 PetscInt nonzerorow=0; 481 482 PetscFunctionBegin; 483 ierr = VecSet(zz,zero);CHKERRQ(ierr); 484 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 485 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 486 487 v = a->a; 488 xb = x; 489 490 for (i=0; i<mbs; i++) { 491 n = ai[1] - ai[0]; /* length of i_th block row of A */ 492 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 493 ib = aj + *ai; 494 jmin = 0; 495 nonzerorow += (n>0); 496 if (*ib == i) { /* (diag of A)*x */ 497 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 498 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 499 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 500 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 501 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 502 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 503 v += 36; jmin++; 504 } 505 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 506 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 507 for (j=jmin; j<n; j++) { 508 /* (strict lower triangular part of A)*x */ 509 cval = ib[j]*6; 510 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 511 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 512 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 513 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 514 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 515 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 516 /* (strict upper triangular part of A)*x */ 517 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 518 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 519 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 520 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 521 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 522 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 523 v += 36; 524 } 525 xb +=6; ai++; 526 } 527 528 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 529 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 530 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 531 PetscFunctionReturn(0); 532 } 533 #undef __FUNCT__ 534 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 535 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 536 { 537 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 538 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 539 const PetscScalar *x,*xb; 540 const MatScalar *v; 541 PetscErrorCode ierr; 542 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 543 const PetscInt *aj=a->j,*ai=a->i,*ib; 544 PetscInt nonzerorow=0; 545 546 PetscFunctionBegin; 547 ierr = VecSet(zz,zero);CHKERRQ(ierr); 548 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 549 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 550 551 v = a->a; 552 xb = x; 553 554 for (i=0; i<mbs; i++) { 555 n = ai[1] - ai[0]; /* length of i_th block row of A */ 556 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 557 ib = aj + *ai; 558 jmin = 0; 559 nonzerorow += (n>0); 560 if (*ib == i) { /* (diag of A)*x */ 561 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 562 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 563 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 564 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 565 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 566 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 567 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 568 v += 49; jmin++; 569 } 570 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 571 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 572 for (j=jmin; j<n; j++) { 573 /* (strict lower triangular part of A)*x */ 574 cval = ib[j]*7; 575 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 576 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 577 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 578 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 579 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 580 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 581 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 582 /* (strict upper triangular part of A)*x */ 583 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 584 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 585 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 586 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 587 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 588 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 589 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 590 v += 49; 591 } 592 xb +=7; ai++; 593 } 594 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 595 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 596 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 597 PetscFunctionReturn(0); 598 } 599 600 /* 601 This will not work with MatScalar == float because it calls the BLAS 602 */ 603 #undef __FUNCT__ 604 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 605 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 606 { 607 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 608 PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0; 609 const PetscScalar *x,*x_ptr,*xb; 610 const MatScalar *v; 611 PetscErrorCode ierr; 612 PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 613 const PetscInt *idx,*aj,*ii; 614 PetscInt nonzerorow=0; 615 616 PetscFunctionBegin; 617 ierr = VecSet(zz,zero);CHKERRQ(ierr); 618 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x; 619 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 620 621 aj = a->j; 622 v = a->a; 623 ii = a->i; 624 625 if (!a->mult_work) { 626 ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr); 627 } 628 work = a->mult_work; 629 630 for (i=0; i<mbs; i++) { 631 n = ii[1] - ii[0]; ncols = n*bs; 632 workt = work; idx=aj+ii[0]; 633 nonzerorow += (n>0); 634 635 /* upper triangular part */ 636 for (j=0; j<n; j++) { 637 xb = x_ptr + bs*(*idx++); 638 for (k=0; k<bs; k++) workt[k] = xb[k]; 639 workt += bs; 640 } 641 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 642 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 643 644 /* strict lower triangular part */ 645 idx = aj+ii[0]; 646 if (*idx == i) { 647 ncols -= bs; v += bs2; idx++; n--; 648 } 649 650 if (ncols > 0) { 651 workt = work; 652 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 653 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 654 for (j=0; j<n; j++) { 655 zb = z_ptr + bs*(*idx++); 656 for (k=0; k<bs; k++) zb[k] += workt[k]; 657 workt += bs; 658 } 659 } 660 x += bs; v += n*bs2; z += bs; ii++; 661 } 662 663 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 664 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 665 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 666 PetscFunctionReturn(0); 667 } 668 669 #undef __FUNCT__ 670 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 671 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 672 { 673 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 674 PetscScalar *z,x1; 675 const PetscScalar *x,*xb; 676 const MatScalar *v; 677 PetscErrorCode ierr; 678 PetscInt mbs =a->mbs,i,n,cval,j,jmin; 679 const PetscInt *aj=a->j,*ai=a->i,*ib; 680 PetscInt nonzerorow=0; 681 682 PetscFunctionBegin; 683 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 684 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 685 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 686 v = a->a; 687 xb = x; 688 689 for (i=0; i<mbs; i++) { 690 n = ai[1] - ai[0]; /* length of i_th row of A */ 691 x1 = xb[0]; 692 ib = aj + *ai; 693 jmin = 0; 694 nonzerorow += (n>0); 695 if (*ib == i) { /* (diag of A)*x */ 696 z[i] += *v++ * x[*ib++]; jmin++; 697 } 698 for (j=jmin; j<n; j++) { 699 cval = *ib; 700 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 701 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 702 } 703 xb++; ai++; 704 } 705 706 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 707 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 708 709 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 710 PetscFunctionReturn(0); 711 } 712 713 #undef __FUNCT__ 714 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 715 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 716 { 717 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 718 PetscScalar *z,x1,x2; 719 const PetscScalar *x,*xb; 720 const MatScalar *v; 721 PetscErrorCode ierr; 722 PetscInt mbs =a->mbs,i,n,cval,j,jmin; 723 const PetscInt *aj=a->j,*ai=a->i,*ib; 724 PetscInt nonzerorow=0; 725 726 PetscFunctionBegin; 727 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 728 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 729 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 730 731 v = a->a; 732 xb = x; 733 734 for (i=0; i<mbs; i++) { 735 n = ai[1] - ai[0]; /* length of i_th block row of A */ 736 x1 = xb[0]; x2 = xb[1]; 737 ib = aj + *ai; 738 jmin = 0; 739 nonzerorow += (n>0); 740 if (*ib == i) { /* (diag of A)*x */ 741 z[2*i] += v[0]*x1 + v[2]*x2; 742 z[2*i+1] += v[2]*x1 + v[3]*x2; 743 v += 4; jmin++; 744 } 745 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 746 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 747 for (j=jmin; j<n; j++) { 748 /* (strict lower triangular part of A)*x */ 749 cval = ib[j]*2; 750 z[cval] += v[0]*x1 + v[1]*x2; 751 z[cval+1] += v[2]*x1 + v[3]*x2; 752 /* (strict upper triangular part of A)*x */ 753 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 754 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 755 v += 4; 756 } 757 xb +=2; ai++; 758 } 759 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 760 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 761 762 ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 763 PetscFunctionReturn(0); 764 } 765 766 #undef __FUNCT__ 767 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 768 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 769 { 770 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 771 PetscScalar *z,x1,x2,x3; 772 const PetscScalar *x,*xb; 773 const MatScalar *v; 774 PetscErrorCode ierr; 775 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 776 const PetscInt *aj=a->j,*ai=a->i,*ib; 777 PetscInt nonzerorow=0; 778 779 PetscFunctionBegin; 780 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 781 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 782 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 783 784 v = a->a; 785 xb = x; 786 787 for (i=0; i<mbs; i++) { 788 n = ai[1] - ai[0]; /* length of i_th block row of A */ 789 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 790 ib = aj + *ai; 791 jmin = 0; 792 nonzerorow += (n>0); 793 if (*ib == i) { /* (diag of A)*x */ 794 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 795 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 796 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 797 v += 9; jmin++; 798 } 799 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 800 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 801 for (j=jmin; j<n; j++) { 802 /* (strict lower triangular part of A)*x */ 803 cval = ib[j]*3; 804 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 805 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 806 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 807 /* (strict upper triangular part of A)*x */ 808 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 809 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 810 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 811 v += 9; 812 } 813 xb +=3; ai++; 814 } 815 816 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 817 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 818 819 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 820 PetscFunctionReturn(0); 821 } 822 823 #undef __FUNCT__ 824 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 825 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 826 { 827 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 828 PetscScalar *z,x1,x2,x3,x4; 829 const PetscScalar *x,*xb; 830 const MatScalar *v; 831 PetscErrorCode ierr; 832 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 833 const PetscInt *aj=a->j,*ai=a->i,*ib; 834 PetscInt nonzerorow=0; 835 836 PetscFunctionBegin; 837 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 838 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 839 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 840 841 v = a->a; 842 xb = x; 843 844 for (i=0; i<mbs; i++) { 845 n = ai[1] - ai[0]; /* length of i_th block row of A */ 846 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 847 ib = aj + *ai; 848 jmin = 0; 849 nonzerorow += (n>0); 850 if (*ib == i) { /* (diag of A)*x */ 851 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 852 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 853 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 854 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 855 v += 16; jmin++; 856 } 857 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 858 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 859 for (j=jmin; j<n; j++) { 860 /* (strict lower triangular part of A)*x */ 861 cval = ib[j]*4; 862 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 863 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 864 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 865 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 866 /* (strict upper triangular part of A)*x */ 867 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 868 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 869 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 870 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 871 v += 16; 872 } 873 xb +=4; ai++; 874 } 875 876 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 877 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 878 879 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 880 PetscFunctionReturn(0); 881 } 882 883 #undef __FUNCT__ 884 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 885 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 886 { 887 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 888 PetscScalar *z,x1,x2,x3,x4,x5; 889 const PetscScalar *x,*xb; 890 const MatScalar *v; 891 PetscErrorCode ierr; 892 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 893 const PetscInt *aj=a->j,*ai=a->i,*ib; 894 PetscInt nonzerorow=0; 895 896 PetscFunctionBegin; 897 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 898 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 899 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 900 901 v = a->a; 902 xb = x; 903 904 for (i=0; i<mbs; i++) { 905 n = ai[1] - ai[0]; /* length of i_th block row of A */ 906 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 907 ib = aj + *ai; 908 jmin = 0; 909 nonzerorow += (n>0); 910 if (*ib == i) { /* (diag of A)*x */ 911 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 912 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 913 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 914 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 915 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 916 v += 25; jmin++; 917 } 918 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 919 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 920 for (j=jmin; j<n; j++) { 921 /* (strict lower triangular part of A)*x */ 922 cval = ib[j]*5; 923 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 924 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 925 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 926 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 927 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 928 /* (strict upper triangular part of A)*x */ 929 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 930 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 931 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 932 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 933 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 934 v += 25; 935 } 936 xb +=5; ai++; 937 } 938 939 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 940 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 941 942 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 943 PetscFunctionReturn(0); 944 } 945 #undef __FUNCT__ 946 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 947 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 948 { 949 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 950 PetscScalar *z,x1,x2,x3,x4,x5,x6; 951 const PetscScalar *x,*xb; 952 const MatScalar *v; 953 PetscErrorCode ierr; 954 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 955 const PetscInt *aj=a->j,*ai=a->i,*ib; 956 PetscInt nonzerorow=0; 957 958 PetscFunctionBegin; 959 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 960 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 961 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 962 963 v = a->a; 964 xb = x; 965 966 for (i=0; i<mbs; i++) { 967 n = ai[1] - ai[0]; /* length of i_th block row of A */ 968 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 969 ib = aj + *ai; 970 jmin = 0; 971 nonzerorow += (n>0); 972 if (*ib == i) { /* (diag of A)*x */ 973 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 974 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 975 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 976 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 977 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 978 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 979 v += 36; jmin++; 980 } 981 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 982 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 983 for (j=jmin; j<n; j++) { 984 /* (strict lower triangular part of A)*x */ 985 cval = ib[j]*6; 986 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 987 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 988 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 989 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 990 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 991 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 992 /* (strict upper triangular part of A)*x */ 993 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 994 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 995 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 996 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 997 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 998 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 999 v += 36; 1000 } 1001 xb +=6; ai++; 1002 } 1003 1004 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1005 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1006 1007 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1008 PetscFunctionReturn(0); 1009 } 1010 1011 #undef __FUNCT__ 1012 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 1013 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1014 { 1015 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1016 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7; 1017 const PetscScalar *x,*xb; 1018 const MatScalar *v; 1019 PetscErrorCode ierr; 1020 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 1021 const PetscInt *aj=a->j,*ai=a->i,*ib; 1022 PetscInt nonzerorow=0; 1023 1024 PetscFunctionBegin; 1025 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1026 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1027 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1028 1029 v = a->a; 1030 xb = x; 1031 1032 for (i=0; i<mbs; i++) { 1033 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1034 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1035 ib = aj + *ai; 1036 jmin = 0; 1037 nonzerorow += (n>0); 1038 if (*ib == i) { /* (diag of A)*x */ 1039 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1040 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1041 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1042 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1043 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1044 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1045 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1046 v += 49; jmin++; 1047 } 1048 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1049 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1050 for (j=jmin; j<n; j++) { 1051 /* (strict lower triangular part of A)*x */ 1052 cval = ib[j]*7; 1053 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1054 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1055 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1056 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1057 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1058 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1059 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1060 /* (strict upper triangular part of A)*x */ 1061 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1062 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1063 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1064 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1065 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1066 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1067 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1068 v += 49; 1069 } 1070 xb +=7; ai++; 1071 } 1072 1073 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1074 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1075 1076 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1077 PetscFunctionReturn(0); 1078 } 1079 1080 #undef __FUNCT__ 1081 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1082 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1083 { 1084 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1085 PetscScalar *z,*z_ptr=0,*zb,*work,*workt; 1086 const PetscScalar *x,*x_ptr,*xb; 1087 const MatScalar *v; 1088 PetscErrorCode ierr; 1089 PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1090 const PetscInt *idx,*aj,*ii; 1091 PetscInt nonzerorow=0; 1092 1093 PetscFunctionBegin; 1094 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1095 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x; 1096 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1097 1098 aj = a->j; 1099 v = a->a; 1100 ii = a->i; 1101 1102 if (!a->mult_work) { 1103 ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr); 1104 } 1105 work = a->mult_work; 1106 1107 1108 for (i=0; i<mbs; i++) { 1109 n = ii[1] - ii[0]; ncols = n*bs; 1110 workt = work; idx=aj+ii[0]; 1111 nonzerorow += (n>0); 1112 1113 /* upper triangular part */ 1114 for (j=0; j<n; j++) { 1115 xb = x_ptr + bs*(*idx++); 1116 for (k=0; k<bs; k++) workt[k] = xb[k]; 1117 workt += bs; 1118 } 1119 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1120 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1121 1122 /* strict lower triangular part */ 1123 idx = aj+ii[0]; 1124 if (*idx == i) { 1125 ncols -= bs; v += bs2; idx++; n--; 1126 } 1127 if (ncols > 0) { 1128 workt = work; 1129 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1130 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1131 for (j=0; j<n; j++) { 1132 zb = z_ptr + bs*(*idx++); 1133 for (k=0; k<bs; k++) zb[k] += workt[k]; 1134 workt += bs; 1135 } 1136 } 1137 1138 x += bs; v += n*bs2; z += bs; ii++; 1139 } 1140 1141 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1142 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1143 1144 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1145 PetscFunctionReturn(0); 1146 } 1147 1148 #undef __FUNCT__ 1149 #define __FUNCT__ "MatScale_SeqSBAIJ" 1150 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1151 { 1152 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1153 PetscScalar oalpha = alpha; 1154 PetscErrorCode ierr; 1155 PetscBLASInt one = 1,totalnz; 1156 1157 PetscFunctionBegin; 1158 ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr); 1159 PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one)); 1160 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1161 PetscFunctionReturn(0); 1162 } 1163 1164 #undef __FUNCT__ 1165 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1166 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1167 { 1168 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1169 const MatScalar *v = a->a; 1170 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1171 PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il; 1172 PetscErrorCode ierr; 1173 const PetscInt *aj=a->j,*col; 1174 1175 PetscFunctionBegin; 1176 if (type == NORM_FROBENIUS) { 1177 for (k=0; k<mbs; k++) { 1178 jmin = a->i[k]; jmax = a->i[k+1]; 1179 col = aj + jmin; 1180 if (*col == k) { /* diagonal block */ 1181 for (i=0; i<bs2; i++) { 1182 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1183 } 1184 jmin++; 1185 } 1186 for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */ 1187 for (i=0; i<bs2; i++) { 1188 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1189 } 1190 } 1191 } 1192 *norm = PetscSqrtReal(sum_diag + 2*sum_off); 1193 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1194 ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr); 1195 for (i=0; i<mbs; i++) jl[i] = mbs; 1196 il[0] = 0; 1197 1198 *norm = 0.0; 1199 for (k=0; k<mbs; k++) { /* k_th block row */ 1200 for (j=0; j<bs; j++) sum[j]=0.0; 1201 /*-- col sum --*/ 1202 i = jl[k]; /* first |A(i,k)| to be added */ 1203 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1204 at step k */ 1205 while (i<mbs) { 1206 nexti = jl[i]; /* next block row to be added */ 1207 ik = il[i]; /* block index of A(i,k) in the array a */ 1208 for (j=0; j<bs; j++) { 1209 v = a->a + ik*bs2 + j*bs; 1210 for (k1=0; k1<bs; k1++) { 1211 sum[j] += PetscAbsScalar(*v); v++; 1212 } 1213 } 1214 /* update il, jl */ 1215 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1216 jmax = a->i[i+1]; 1217 if (jmin < jmax) { 1218 il[i] = jmin; 1219 j = a->j[jmin]; 1220 jl[i] = jl[j]; jl[j]=i; 1221 } 1222 i = nexti; 1223 } 1224 /*-- row sum --*/ 1225 jmin = a->i[k]; jmax = a->i[k+1]; 1226 for (i=jmin; i<jmax; i++) { 1227 for (j=0; j<bs; j++) { 1228 v = a->a + i*bs2 + j; 1229 for (k1=0; k1<bs; k1++) { 1230 sum[j] += PetscAbsScalar(*v); v += bs; 1231 } 1232 } 1233 } 1234 /* add k_th block row to il, jl */ 1235 col = aj+jmin; 1236 if (*col == k) jmin++; 1237 if (jmin < jmax) { 1238 il[k] = jmin; 1239 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1240 } 1241 for (j=0; j<bs; j++) { 1242 if (sum[j] > *norm) *norm = sum[j]; 1243 } 1244 } 1245 ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 1246 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1247 PetscFunctionReturn(0); 1248 } 1249 1250 #undef __FUNCT__ 1251 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1252 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg) 1253 { 1254 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data; 1255 PetscErrorCode ierr; 1256 1257 PetscFunctionBegin; 1258 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1259 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1260 *flg = PETSC_FALSE; 1261 PetscFunctionReturn(0); 1262 } 1263 1264 /* if the a->i are the same */ 1265 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1266 if (!*flg) PetscFunctionReturn(0); 1267 1268 /* if a->j are the same */ 1269 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1270 if (!*flg) PetscFunctionReturn(0); 1271 1272 /* if a->a are the same */ 1273 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1274 PetscFunctionReturn(0); 1275 } 1276 1277 #undef __FUNCT__ 1278 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1279 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1280 { 1281 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1282 PetscErrorCode ierr; 1283 PetscInt i,j,k,row,bs,ambs,bs2; 1284 const PetscInt *ai,*aj; 1285 PetscScalar *x,zero = 0.0; 1286 const MatScalar *aa,*aa_j; 1287 1288 PetscFunctionBegin; 1289 bs = A->rmap->bs; 1290 if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1291 1292 aa = a->a; 1293 ambs = a->mbs; 1294 1295 if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 1296 PetscInt *diag=a->diag; 1297 aa = a->a; 1298 ambs = a->mbs; 1299 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1300 for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]]; 1301 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1302 PetscFunctionReturn(0); 1303 } 1304 1305 ai = a->i; 1306 aj = a->j; 1307 bs2 = a->bs2; 1308 ierr = VecSet(v,zero);CHKERRQ(ierr); 1309 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1310 for (i=0; i<ambs; i++) { 1311 j=ai[i]; 1312 if (aj[j] == i) { /* if this is a diagonal element */ 1313 row = i*bs; 1314 aa_j = aa + j*bs2; 1315 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1316 } 1317 } 1318 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1319 PetscFunctionReturn(0); 1320 } 1321 1322 #undef __FUNCT__ 1323 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1324 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1325 { 1326 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1327 PetscScalar x; 1328 const PetscScalar *l,*li,*ri; 1329 MatScalar *aa,*v; 1330 PetscErrorCode ierr; 1331 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1332 PetscBool flg; 1333 1334 PetscFunctionBegin; 1335 if (ll != rr) { 1336 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1337 if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1338 } 1339 if (!ll) PetscFunctionReturn(0); 1340 ai = a->i; 1341 aj = a->j; 1342 aa = a->a; 1343 m = A->rmap->N; 1344 bs = A->rmap->bs; 1345 mbs = a->mbs; 1346 bs2 = a->bs2; 1347 1348 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 1349 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1350 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1351 for (i=0; i<mbs; i++) { /* for each block row */ 1352 M = ai[i+1] - ai[i]; 1353 li = l + i*bs; 1354 v = aa + bs2*ai[i]; 1355 for (j=0; j<M; j++) { /* for each block */ 1356 ri = l + bs*aj[ai[i]+j]; 1357 for (k=0; k<bs; k++) { 1358 x = ri[k]; 1359 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1360 } 1361 } 1362 } 1363 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 1364 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1365 PetscFunctionReturn(0); 1366 } 1367 1368 #undef __FUNCT__ 1369 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1370 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1371 { 1372 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1373 1374 PetscFunctionBegin; 1375 info->block_size = a->bs2; 1376 info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */ 1377 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1378 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1379 info->assemblies = A->num_ass; 1380 info->mallocs = A->info.mallocs; 1381 info->memory = ((PetscObject)A)->mem; 1382 if (A->factortype) { 1383 info->fill_ratio_given = A->info.fill_ratio_given; 1384 info->fill_ratio_needed = A->info.fill_ratio_needed; 1385 info->factor_mallocs = A->info.factor_mallocs; 1386 } else { 1387 info->fill_ratio_given = 0; 1388 info->fill_ratio_needed = 0; 1389 info->factor_mallocs = 0; 1390 } 1391 PetscFunctionReturn(0); 1392 } 1393 1394 1395 #undef __FUNCT__ 1396 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1397 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1398 { 1399 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1400 PetscErrorCode ierr; 1401 1402 PetscFunctionBegin; 1403 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1404 PetscFunctionReturn(0); 1405 } 1406 1407 #undef __FUNCT__ 1408 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ" 1409 /* 1410 This code does not work since it only checks the upper triangular part of 1411 the matrix. Hence it is not listed in the function table. 1412 */ 1413 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1414 { 1415 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1416 PetscErrorCode ierr; 1417 PetscInt i,j,n,row,col,bs,mbs; 1418 const PetscInt *ai,*aj; 1419 PetscReal atmp; 1420 const MatScalar *aa; 1421 PetscScalar *x; 1422 PetscInt ncols,brow,bcol,krow,kcol; 1423 1424 PetscFunctionBegin; 1425 if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1426 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1427 bs = A->rmap->bs; 1428 aa = a->a; 1429 ai = a->i; 1430 aj = a->j; 1431 mbs = a->mbs; 1432 1433 ierr = VecSet(v,0.0);CHKERRQ(ierr); 1434 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1435 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1436 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1437 for (i=0; i<mbs; i++) { 1438 ncols = ai[1] - ai[0]; ai++; 1439 brow = bs*i; 1440 for (j=0; j<ncols; j++) { 1441 bcol = bs*(*aj); 1442 for (kcol=0; kcol<bs; kcol++) { 1443 col = bcol + kcol; /* col index */ 1444 for (krow=0; krow<bs; krow++) { 1445 atmp = PetscAbsScalar(*aa); aa++; 1446 row = brow + krow; /* row index */ 1447 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1448 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1449 } 1450 } 1451 aj++; 1452 } 1453 } 1454 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1455 PetscFunctionReturn(0); 1456 } 1457