1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <../src/mat/blockinvert.h> 4 #include <petscbt.h> 5 #include <../src/mat/impls/sbaij/seq/sbaij.h> 6 #include <petscblaslapack.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 11 { 12 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 15 const PetscInt *idx; 16 PetscBT table_out,table_in; 17 18 PetscFunctionBegin; 19 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 20 mbs = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr); 25 ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 26 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 27 ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr); 28 29 for (i=0; i<is_max; i++) { /* for each is */ 30 isz = 0; 31 ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr); 32 33 /* Extract the indices, assume there can be duplicate entries */ 34 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36 37 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 38 bcol_max = 0; 39 for (j=0; j<n ; ++j) { 40 brow = idx[j]/bs; /* convert the indices into block indices */ 41 if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 42 if (!PetscBTLookupSet(table_out,brow)) { 43 nidx[isz++] = brow; 44 if (bcol_max < brow) bcol_max = brow; 45 } 46 } 47 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 48 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 49 50 k = 0; 51 for (j=0; j<ov; j++) { /* for each overlap */ 52 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 53 ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr); 54 for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); } 55 56 n = isz; /* length of the updated is[i] */ 57 for (brow=0; brow<mbs; brow++) { 58 start = ai[brow]; end = ai[brow+1]; 59 if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 60 for (l = start; l<end ; l++) { 61 bcol = aj[l]; 62 if (!PetscBTLookupSet(table_out,bcol)) { 63 nidx[isz++] = bcol; 64 if (bcol_max < bcol) bcol_max = bcol; 65 } 66 } 67 k++; 68 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 69 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 70 for (l = start; l<end ; l++) { 71 bcol = aj[l]; 72 if (bcol > bcol_max) break; 73 if (PetscBTLookup(table_in,bcol)) { 74 if (!PetscBTLookupSet(table_out,brow)) {nidx[isz++] = brow;} 75 break; /* for l = start; l<end ; l++) */ 76 } 77 } 78 } 79 } 80 } /* for each overlap */ 81 82 /* expand the Index Set */ 83 for (j=0; j<isz; j++) { 84 for (k=0; k<bs; k++) 85 nidx2[j*bs+k] = nidx[j]*bs+k; 86 } 87 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 88 } 89 ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr); 90 ierr = PetscFree(nidx);CHKERRQ(ierr); 91 ierr = PetscFree(nidx2);CHKERRQ(ierr); 92 ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr); 93 PetscFunctionReturn(0); 94 } 95 96 #undef __FUNCT__ 97 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 98 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 99 { 100 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 101 PetscErrorCode ierr; 102 PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 103 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 104 PetscInt nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 105 const PetscInt *irow,*aj = a->j,*ai = a->i; 106 MatScalar *mat_a; 107 Mat C; 108 PetscBool flag,sorted; 109 110 PetscFunctionBegin; 111 if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 112 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 113 if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 114 115 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 116 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 117 118 ierr = PetscMalloc(oldcols*sizeof(PetscInt),&smap);CHKERRQ(ierr); 119 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 120 ssmap = smap; 121 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 122 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 123 /* determine lens of each row */ 124 for (i=0; i<nrows; i++) { 125 kstart = ai[irow[i]]; 126 kend = kstart + a->ilen[irow[i]]; 127 lens[i] = 0; 128 for (k=kstart; k<kend; k++) { 129 if (ssmap[aj[k]]) { 130 lens[i]++; 131 } 132 } 133 } 134 /* Create and fill new matrix */ 135 if (scall == MAT_REUSE_MATRIX) { 136 c = (Mat_SeqSBAIJ *)((*B)->data); 137 138 if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 139 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 140 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 141 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 142 C = *B; 143 } else { 144 ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 145 ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 146 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 147 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr); 148 } 149 c = (Mat_SeqSBAIJ *)(C->data); 150 for (i=0; i<nrows; i++) { 151 row = irow[i]; 152 kstart = ai[row]; 153 kend = kstart + a->ilen[row]; 154 mat_i = c->i[i]; 155 mat_j = c->j + mat_i; 156 mat_a = c->a + mat_i*bs2; 157 mat_ilen = c->ilen + i; 158 for (k=kstart; k<kend; k++) { 159 if ((tcol=ssmap[a->j[k]])) { 160 *mat_j++ = tcol - 1; 161 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 162 mat_a += bs2; 163 (*mat_ilen)++; 164 } 165 } 166 } 167 168 /* Free work space */ 169 ierr = PetscFree(smap);CHKERRQ(ierr); 170 ierr = PetscFree(lens);CHKERRQ(ierr); 171 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 172 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 173 174 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 175 *B = C; 176 PetscFunctionReturn(0); 177 } 178 179 #undef __FUNCT__ 180 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 181 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 182 { 183 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 184 IS is1; 185 PetscErrorCode ierr; 186 PetscInt *vary,*iary,nrows,i,bs=A->rmap->bs,count; 187 const PetscInt *irow; 188 189 PetscFunctionBegin; 190 if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 191 192 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 193 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 194 195 /* Verify if the indices corespond to each element in a block 196 and form the IS with compressed IS */ 197 ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr); 198 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 199 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 200 201 count = 0; 202 for (i=0; i<a->mbs; i++) { 203 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 204 if (vary[i]==bs) iary[count++] = i; 205 } 206 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 207 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 208 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 209 210 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr); 211 ierr = ISDestroy(&is1);CHKERRQ(ierr); 212 PetscFunctionReturn(0); 213 } 214 215 #undef __FUNCT__ 216 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 217 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 218 { 219 PetscErrorCode ierr; 220 PetscInt i; 221 222 PetscFunctionBegin; 223 if (scall == MAT_INITIAL_MATRIX) { 224 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 225 } 226 227 for (i=0; i<n; i++) { 228 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 229 } 230 PetscFunctionReturn(0); 231 } 232 233 /* -------------------------------------------------------*/ 234 /* Should check that shapes of vectors and matrices match */ 235 /* -------------------------------------------------------*/ 236 237 #undef __FUNCT__ 238 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 239 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 240 { 241 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 242 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 243 MatScalar *v; 244 PetscErrorCode ierr; 245 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 246 PetscInt nonzerorow=0; 247 248 PetscFunctionBegin; 249 ierr = VecSet(zz,zero);CHKERRQ(ierr); 250 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 251 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 252 253 v = a->a; 254 xb = x; 255 256 for (i=0; i<mbs; i++) { 257 n = ai[1] - ai[0]; /* length of i_th block row of A */ 258 x1 = xb[0]; x2 = xb[1]; 259 ib = aj + *ai; 260 jmin = 0; 261 nonzerorow += (n>0); 262 if (*ib == i) { /* (diag of A)*x */ 263 z[2*i] += v[0]*x1 + v[2]*x2; 264 z[2*i+1] += v[2]*x1 + v[3]*x2; 265 v += 4; jmin++; 266 } 267 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 268 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 269 for (j=jmin; j<n; j++) { 270 /* (strict lower triangular part of A)*x */ 271 cval = ib[j]*2; 272 z[cval] += v[0]*x1 + v[1]*x2; 273 z[cval+1] += v[2]*x1 + v[3]*x2; 274 /* (strict upper triangular part of A)*x */ 275 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 276 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 277 v += 4; 278 } 279 xb +=2; ai++; 280 } 281 282 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 283 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 284 ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 285 PetscFunctionReturn(0); 286 } 287 288 #undef __FUNCT__ 289 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 290 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 291 { 292 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 293 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 294 MatScalar *v; 295 PetscErrorCode ierr; 296 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 297 PetscInt nonzerorow=0; 298 299 PetscFunctionBegin; 300 ierr = VecSet(zz,zero);CHKERRQ(ierr); 301 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 302 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 303 304 v = a->a; 305 xb = x; 306 307 for (i=0; i<mbs; i++) { 308 n = ai[1] - ai[0]; /* length of i_th block row of A */ 309 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 310 ib = aj + *ai; 311 jmin = 0; 312 nonzerorow += (n>0); 313 if (*ib == i) { /* (diag of A)*x */ 314 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 315 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 316 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 317 v += 9; jmin++; 318 } 319 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 320 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 321 for (j=jmin; j<n; j++) { 322 /* (strict lower triangular part of A)*x */ 323 cval = ib[j]*3; 324 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 325 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 326 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 327 /* (strict upper triangular part of A)*x */ 328 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 329 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 330 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 331 v += 9; 332 } 333 xb +=3; ai++; 334 } 335 336 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 337 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 338 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 339 PetscFunctionReturn(0); 340 } 341 342 #undef __FUNCT__ 343 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 344 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 345 { 346 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 347 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 348 MatScalar *v; 349 PetscErrorCode ierr; 350 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 351 PetscInt nonzerorow=0; 352 353 PetscFunctionBegin; 354 ierr = VecSet(zz,zero);CHKERRQ(ierr); 355 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 356 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 357 358 v = a->a; 359 xb = x; 360 361 for (i=0; i<mbs; i++) { 362 n = ai[1] - ai[0]; /* length of i_th block row of A */ 363 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 364 ib = aj + *ai; 365 jmin = 0; 366 nonzerorow += (n>0); 367 if (*ib == i) { /* (diag of A)*x */ 368 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 369 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 370 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 371 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 372 v += 16; jmin++; 373 } 374 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 375 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 376 for (j=jmin; j<n; j++) { 377 /* (strict lower triangular part of A)*x */ 378 cval = ib[j]*4; 379 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 380 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 381 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 382 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 383 /* (strict upper triangular part of A)*x */ 384 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 385 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 386 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 387 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 388 v += 16; 389 } 390 xb +=4; ai++; 391 } 392 393 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 394 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 395 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 396 PetscFunctionReturn(0); 397 } 398 399 #undef __FUNCT__ 400 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 401 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 402 { 403 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 404 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 405 MatScalar *v; 406 PetscErrorCode ierr; 407 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 408 PetscInt nonzerorow=0; 409 410 PetscFunctionBegin; 411 ierr = VecSet(zz,zero);CHKERRQ(ierr); 412 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 413 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 414 415 v = a->a; 416 xb = x; 417 418 for (i=0; i<mbs; i++) { 419 n = ai[1] - ai[0]; /* length of i_th block row of A */ 420 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 421 ib = aj + *ai; 422 jmin = 0; 423 nonzerorow += (n>0); 424 if (*ib == i) { /* (diag of A)*x */ 425 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 426 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 427 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 428 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 429 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 430 v += 25; jmin++; 431 } 432 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 433 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 434 for (j=jmin; j<n; j++) { 435 /* (strict lower triangular part of A)*x */ 436 cval = ib[j]*5; 437 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 438 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 439 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 440 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 441 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 442 /* (strict upper triangular part of A)*x */ 443 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 444 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 445 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 446 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 447 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 448 v += 25; 449 } 450 xb +=5; ai++; 451 } 452 453 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 454 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 455 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 456 PetscFunctionReturn(0); 457 } 458 459 460 #undef __FUNCT__ 461 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 462 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 463 { 464 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 465 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 466 MatScalar *v; 467 PetscErrorCode ierr; 468 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 469 PetscInt nonzerorow=0; 470 471 PetscFunctionBegin; 472 ierr = VecSet(zz,zero);CHKERRQ(ierr); 473 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 474 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 475 476 v = a->a; 477 xb = x; 478 479 for (i=0; i<mbs; i++) { 480 n = ai[1] - ai[0]; /* length of i_th block row of A */ 481 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 482 ib = aj + *ai; 483 jmin = 0; 484 nonzerorow += (n>0); 485 if (*ib == i) { /* (diag of A)*x */ 486 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 487 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 488 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 489 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 490 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 491 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 492 v += 36; jmin++; 493 } 494 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 495 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 496 for (j=jmin; j<n; j++) { 497 /* (strict lower triangular part of A)*x */ 498 cval = ib[j]*6; 499 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 500 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 501 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 502 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 503 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 504 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 505 /* (strict upper triangular part of A)*x */ 506 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 507 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 508 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 509 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 510 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 511 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 512 v += 36; 513 } 514 xb +=6; ai++; 515 } 516 517 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 518 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 519 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 520 PetscFunctionReturn(0); 521 } 522 #undef __FUNCT__ 523 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 524 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 525 { 526 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 527 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 528 MatScalar *v; 529 PetscErrorCode ierr; 530 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 531 PetscInt nonzerorow=0; 532 533 PetscFunctionBegin; 534 ierr = VecSet(zz,zero);CHKERRQ(ierr); 535 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 536 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 537 538 v = a->a; 539 xb = x; 540 541 for (i=0; i<mbs; i++) { 542 n = ai[1] - ai[0]; /* length of i_th block row of A */ 543 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 544 ib = aj + *ai; 545 jmin = 0; 546 nonzerorow += (n>0); 547 if (*ib == i) { /* (diag of A)*x */ 548 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 549 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 550 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 551 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 552 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 553 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 554 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 555 v += 49; jmin++; 556 } 557 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 558 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 559 for (j=jmin; j<n; j++) { 560 /* (strict lower triangular part of A)*x */ 561 cval = ib[j]*7; 562 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 563 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 564 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 565 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 566 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 567 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 568 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 569 /* (strict upper triangular part of A)*x */ 570 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 571 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 572 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 573 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 574 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 575 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 576 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 577 v += 49; 578 } 579 xb +=7; ai++; 580 } 581 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 582 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 583 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 584 PetscFunctionReturn(0); 585 } 586 587 /* 588 This will not work with MatScalar == float because it calls the BLAS 589 */ 590 #undef __FUNCT__ 591 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 592 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 593 { 594 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 595 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 596 MatScalar *v; 597 PetscErrorCode ierr; 598 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 599 PetscInt nonzerorow=0; 600 601 PetscFunctionBegin; 602 ierr = VecSet(zz,zero);CHKERRQ(ierr); 603 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 604 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 605 606 aj = a->j; 607 v = a->a; 608 ii = a->i; 609 610 if (!a->mult_work) { 611 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 612 } 613 work = a->mult_work; 614 615 for (i=0; i<mbs; i++) { 616 n = ii[1] - ii[0]; ncols = n*bs; 617 workt = work; idx=aj+ii[0]; 618 nonzerorow += (n>0); 619 620 /* upper triangular part */ 621 for (j=0; j<n; j++) { 622 xb = x_ptr + bs*(*idx++); 623 for (k=0; k<bs; k++) workt[k] = xb[k]; 624 workt += bs; 625 } 626 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 627 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 628 629 /* strict lower triangular part */ 630 idx = aj+ii[0]; 631 if (*idx == i) { 632 ncols -= bs; v += bs2; idx++; n--; 633 } 634 635 if (ncols > 0) { 636 workt = work; 637 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 638 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 639 for (j=0; j<n; j++) { 640 zb = z_ptr + bs*(*idx++); 641 for (k=0; k<bs; k++) zb[k] += workt[k] ; 642 workt += bs; 643 } 644 } 645 x += bs; v += n*bs2; z += bs; ii++; 646 } 647 648 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 649 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 650 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 651 PetscFunctionReturn(0); 652 } 653 654 #undef __FUNCT__ 655 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 656 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 657 { 658 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 659 PetscScalar *x,*z,*xb,x1; 660 MatScalar *v; 661 PetscErrorCode ierr; 662 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 663 PetscInt nonzerorow=0; 664 665 PetscFunctionBegin; 666 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 667 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 668 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 669 v = a->a; 670 xb = x; 671 672 for (i=0; i<mbs; i++) { 673 n = ai[1] - ai[0]; /* length of i_th row of A */ 674 x1 = xb[0]; 675 ib = aj + *ai; 676 jmin = 0; 677 nonzerorow += (n>0); 678 if (*ib == i) { /* (diag of A)*x */ 679 z[i] += *v++ * x[*ib++]; jmin++; 680 } 681 for (j=jmin; j<n; j++) { 682 cval = *ib; 683 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 684 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 685 } 686 xb++; ai++; 687 } 688 689 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 690 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 691 692 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 693 PetscFunctionReturn(0); 694 } 695 696 #undef __FUNCT__ 697 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 698 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 699 { 700 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 701 PetscScalar *x,*z,*xb,x1,x2; 702 MatScalar *v; 703 PetscErrorCode ierr; 704 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 705 PetscInt nonzerorow=0; 706 707 PetscFunctionBegin; 708 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 709 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 710 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 711 712 v = a->a; 713 xb = x; 714 715 for (i=0; i<mbs; i++) { 716 n = ai[1] - ai[0]; /* length of i_th block row of A */ 717 x1 = xb[0]; x2 = xb[1]; 718 ib = aj + *ai; 719 jmin = 0; 720 nonzerorow += (n>0); 721 if (*ib == i) { /* (diag of A)*x */ 722 z[2*i] += v[0]*x1 + v[2]*x2; 723 z[2*i+1] += v[2]*x1 + v[3]*x2; 724 v += 4; jmin++; 725 } 726 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 727 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 728 for (j=jmin; j<n; j++) { 729 /* (strict lower triangular part of A)*x */ 730 cval = ib[j]*2; 731 z[cval] += v[0]*x1 + v[1]*x2; 732 z[cval+1] += v[2]*x1 + v[3]*x2; 733 /* (strict upper triangular part of A)*x */ 734 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 735 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 736 v += 4; 737 } 738 xb +=2; ai++; 739 } 740 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 741 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 742 743 ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 744 PetscFunctionReturn(0); 745 } 746 747 #undef __FUNCT__ 748 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 749 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 750 { 751 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 752 PetscScalar *x,*z,*xb,x1,x2,x3; 753 MatScalar *v; 754 PetscErrorCode ierr; 755 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 756 PetscInt nonzerorow=0; 757 758 PetscFunctionBegin; 759 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 760 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 761 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 762 763 v = a->a; 764 xb = x; 765 766 for (i=0; i<mbs; i++) { 767 n = ai[1] - ai[0]; /* length of i_th block row of A */ 768 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 769 ib = aj + *ai; 770 jmin = 0; 771 nonzerorow += (n>0); 772 if (*ib == i) { /* (diag of A)*x */ 773 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 774 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 775 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 776 v += 9; jmin++; 777 } 778 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 779 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 780 for (j=jmin; j<n; j++) { 781 /* (strict lower triangular part of A)*x */ 782 cval = ib[j]*3; 783 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 784 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 785 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 786 /* (strict upper triangular part of A)*x */ 787 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 788 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 789 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 790 v += 9; 791 } 792 xb +=3; ai++; 793 } 794 795 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 796 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 797 798 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 799 PetscFunctionReturn(0); 800 } 801 802 #undef __FUNCT__ 803 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 804 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 805 { 806 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 807 PetscScalar *x,*z,*xb,x1,x2,x3,x4; 808 MatScalar *v; 809 PetscErrorCode ierr; 810 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 811 PetscInt nonzerorow=0; 812 813 PetscFunctionBegin; 814 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 815 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 816 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 817 818 v = a->a; 819 xb = x; 820 821 for (i=0; i<mbs; i++) { 822 n = ai[1] - ai[0]; /* length of i_th block row of A */ 823 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 824 ib = aj + *ai; 825 jmin = 0; 826 nonzerorow += (n>0); 827 if (*ib == i) { /* (diag of A)*x */ 828 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 829 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 830 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 831 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 832 v += 16; jmin++; 833 } 834 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 835 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 836 for (j=jmin; j<n; j++) { 837 /* (strict lower triangular part of A)*x */ 838 cval = ib[j]*4; 839 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 840 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 841 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 842 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 843 /* (strict upper triangular part of A)*x */ 844 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 845 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 846 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 847 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 848 v += 16; 849 } 850 xb +=4; ai++; 851 } 852 853 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 854 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 855 856 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 857 PetscFunctionReturn(0); 858 } 859 860 #undef __FUNCT__ 861 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 862 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 863 { 864 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 865 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5; 866 MatScalar *v; 867 PetscErrorCode ierr; 868 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 869 PetscInt nonzerorow=0; 870 871 PetscFunctionBegin; 872 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 873 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 874 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 875 876 v = a->a; 877 xb = x; 878 879 for (i=0; i<mbs; i++) { 880 n = ai[1] - ai[0]; /* length of i_th block row of A */ 881 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 882 ib = aj + *ai; 883 jmin = 0; 884 nonzerorow += (n>0); 885 if (*ib == i) { /* (diag of A)*x */ 886 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 887 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 888 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 889 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 890 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 891 v += 25; jmin++; 892 } 893 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 894 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 895 for (j=jmin; j<n; j++) { 896 /* (strict lower triangular part of A)*x */ 897 cval = ib[j]*5; 898 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 899 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 900 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 901 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 902 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 903 /* (strict upper triangular part of A)*x */ 904 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 905 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 906 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 907 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 908 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 909 v += 25; 910 } 911 xb +=5; ai++; 912 } 913 914 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 915 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 916 917 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 918 PetscFunctionReturn(0); 919 } 920 #undef __FUNCT__ 921 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 922 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 923 { 924 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 925 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6; 926 MatScalar *v; 927 PetscErrorCode ierr; 928 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 929 PetscInt nonzerorow=0; 930 931 PetscFunctionBegin; 932 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 933 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 934 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 935 936 v = a->a; 937 xb = x; 938 939 for (i=0; i<mbs; i++) { 940 n = ai[1] - ai[0]; /* length of i_th block row of A */ 941 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 942 ib = aj + *ai; 943 jmin = 0; 944 nonzerorow += (n>0); 945 if (*ib == i) { /* (diag of A)*x */ 946 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 947 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 948 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 949 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 950 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 951 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 952 v += 36; jmin++; 953 } 954 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 955 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 956 for (j=jmin; j<n; j++) { 957 /* (strict lower triangular part of A)*x */ 958 cval = ib[j]*6; 959 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 960 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 961 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 962 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 963 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 964 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 965 /* (strict upper triangular part of A)*x */ 966 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 967 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 968 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 969 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 970 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 971 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 972 v += 36; 973 } 974 xb +=6; ai++; 975 } 976 977 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 978 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 979 980 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 981 PetscFunctionReturn(0); 982 } 983 984 #undef __FUNCT__ 985 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 986 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 987 { 988 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 989 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 990 MatScalar *v; 991 PetscErrorCode ierr; 992 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 993 PetscInt nonzerorow=0; 994 995 PetscFunctionBegin; 996 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 997 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 998 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 999 1000 v = a->a; 1001 xb = x; 1002 1003 for (i=0; i<mbs; i++) { 1004 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1005 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1006 ib = aj + *ai; 1007 jmin = 0; 1008 nonzerorow += (n>0); 1009 if (*ib == i) { /* (diag of A)*x */ 1010 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1011 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1012 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1013 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1014 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1015 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1016 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1017 v += 49; jmin++; 1018 } 1019 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1020 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1021 for (j=jmin; j<n; j++) { 1022 /* (strict lower triangular part of A)*x */ 1023 cval = ib[j]*7; 1024 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1025 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1026 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1027 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1028 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1029 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1030 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1031 /* (strict upper triangular part of A)*x */ 1032 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1033 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1034 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1035 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1036 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1037 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1038 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1039 v += 49; 1040 } 1041 xb +=7; ai++; 1042 } 1043 1044 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1045 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1046 1047 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1048 PetscFunctionReturn(0); 1049 } 1050 1051 #undef __FUNCT__ 1052 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1053 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1054 { 1055 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1056 PetscScalar *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1057 MatScalar *v; 1058 PetscErrorCode ierr; 1059 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1060 PetscInt nonzerorow=0; 1061 1062 PetscFunctionBegin; 1063 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1064 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 1065 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1066 1067 aj = a->j; 1068 v = a->a; 1069 ii = a->i; 1070 1071 if (!a->mult_work) { 1072 ierr = PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1073 } 1074 work = a->mult_work; 1075 1076 1077 for (i=0; i<mbs; i++) { 1078 n = ii[1] - ii[0]; ncols = n*bs; 1079 workt = work; idx=aj+ii[0]; 1080 nonzerorow += (n>0); 1081 1082 /* upper triangular part */ 1083 for (j=0; j<n; j++) { 1084 xb = x_ptr + bs*(*idx++); 1085 for (k=0; k<bs; k++) workt[k] = xb[k]; 1086 workt += bs; 1087 } 1088 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1089 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1090 1091 /* strict lower triangular part */ 1092 idx = aj+ii[0]; 1093 if (*idx == i) { 1094 ncols -= bs; v += bs2; idx++; n--; 1095 } 1096 if (ncols > 0) { 1097 workt = work; 1098 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1099 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1100 for (j=0; j<n; j++) { 1101 zb = z_ptr + bs*(*idx++); 1102 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1103 workt += bs; 1104 } 1105 } 1106 1107 x += bs; v += n*bs2; z += bs; ii++; 1108 } 1109 1110 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1111 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1112 1113 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1114 PetscFunctionReturn(0); 1115 } 1116 1117 #undef __FUNCT__ 1118 #define __FUNCT__ "MatScale_SeqSBAIJ" 1119 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1120 { 1121 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1122 PetscScalar oalpha = alpha; 1123 PetscErrorCode ierr; 1124 PetscBLASInt one = 1,totalnz; 1125 1126 PetscFunctionBegin; 1127 ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr); 1128 BLASscal_(&totalnz,&oalpha,a->a,&one); 1129 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1130 PetscFunctionReturn(0); 1131 } 1132 1133 #undef __FUNCT__ 1134 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1135 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1136 { 1137 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1138 MatScalar *v = a->a; 1139 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1140 PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1141 PetscErrorCode ierr; 1142 PetscInt *jl,*il,jmin,jmax,nexti,ik,*col; 1143 1144 PetscFunctionBegin; 1145 if (type == NORM_FROBENIUS) { 1146 for (k=0; k<mbs; k++) { 1147 jmin = a->i[k]; jmax = a->i[k+1]; 1148 col = aj + jmin; 1149 if (*col == k) { /* diagonal block */ 1150 for (i=0; i<bs2; i++) { 1151 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1152 } 1153 jmin++; 1154 } 1155 for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */ 1156 for (i=0; i<bs2; i++) { 1157 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1158 } 1159 } 1160 } 1161 *norm = PetscSqrtReal(sum_diag + 2*sum_off); 1162 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1163 ierr = PetscMalloc3(bs,PetscReal,&sum,mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr); 1164 for (i=0; i<mbs; i++) jl[i] = mbs; 1165 il[0] = 0; 1166 1167 *norm = 0.0; 1168 for (k=0; k<mbs; k++) { /* k_th block row */ 1169 for (j=0; j<bs; j++) sum[j]=0.0; 1170 /*-- col sum --*/ 1171 i = jl[k]; /* first |A(i,k)| to be added */ 1172 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1173 at step k */ 1174 while (i<mbs) { 1175 nexti = jl[i]; /* next block row to be added */ 1176 ik = il[i]; /* block index of A(i,k) in the array a */ 1177 for (j=0; j<bs; j++) { 1178 v = a->a + ik*bs2 + j*bs; 1179 for (k1=0; k1<bs; k1++) { 1180 sum[j] += PetscAbsScalar(*v); v++; 1181 } 1182 } 1183 /* update il, jl */ 1184 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1185 jmax = a->i[i+1]; 1186 if (jmin < jmax) { 1187 il[i] = jmin; 1188 j = a->j[jmin]; 1189 jl[i] = jl[j]; jl[j]=i; 1190 } 1191 i = nexti; 1192 } 1193 /*-- row sum --*/ 1194 jmin = a->i[k]; jmax = a->i[k+1]; 1195 for (i=jmin; i<jmax; i++) { 1196 for (j=0; j<bs; j++) { 1197 v = a->a + i*bs2 + j; 1198 for (k1=0; k1<bs; k1++) { 1199 sum[j] += PetscAbsScalar(*v); v += bs; 1200 } 1201 } 1202 } 1203 /* add k_th block row to il, jl */ 1204 col = aj+jmin; 1205 if (*col == k) jmin++; 1206 if (jmin < jmax) { 1207 il[k] = jmin; 1208 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1209 } 1210 for (j=0; j<bs; j++) { 1211 if (sum[j] > *norm) *norm = sum[j]; 1212 } 1213 } 1214 ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 1215 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1216 PetscFunctionReturn(0); 1217 } 1218 1219 #undef __FUNCT__ 1220 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1221 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg) 1222 { 1223 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1224 PetscErrorCode ierr; 1225 1226 PetscFunctionBegin; 1227 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1228 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1229 *flg = PETSC_FALSE; 1230 PetscFunctionReturn(0); 1231 } 1232 1233 /* if the a->i are the same */ 1234 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1235 if (!*flg) { 1236 PetscFunctionReturn(0); 1237 } 1238 1239 /* if a->j are the same */ 1240 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1241 if (!*flg) { 1242 PetscFunctionReturn(0); 1243 } 1244 /* if a->a are the same */ 1245 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1246 PetscFunctionReturn(0); 1247 } 1248 1249 #undef __FUNCT__ 1250 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1251 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1252 { 1253 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1254 PetscErrorCode ierr; 1255 PetscInt i,j,k,row,bs,*ai,*aj,ambs,bs2; 1256 PetscScalar *x,zero = 0.0; 1257 MatScalar *aa,*aa_j; 1258 1259 PetscFunctionBegin; 1260 bs = A->rmap->bs; 1261 if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1262 1263 aa = a->a; 1264 ambs = a->mbs; 1265 1266 if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 1267 PetscInt *diag=a->diag; 1268 aa = a->a; 1269 ambs = a->mbs; 1270 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1271 for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]]; 1272 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1273 PetscFunctionReturn(0); 1274 } 1275 1276 ai = a->i; 1277 aj = a->j; 1278 bs2 = a->bs2; 1279 ierr = VecSet(v,zero);CHKERRQ(ierr); 1280 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1281 for (i=0; i<ambs; i++) { 1282 j=ai[i]; 1283 if (aj[j] == i) { /* if this is a diagonal element */ 1284 row = i*bs; 1285 aa_j = aa + j*bs2; 1286 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1287 } 1288 } 1289 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1290 PetscFunctionReturn(0); 1291 } 1292 1293 #undef __FUNCT__ 1294 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1295 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1296 { 1297 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1298 PetscScalar *l,x,*li,*ri; 1299 MatScalar *aa,*v; 1300 PetscErrorCode ierr; 1301 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1302 PetscBool flg; 1303 1304 PetscFunctionBegin; 1305 if (ll != rr) { 1306 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1307 if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1308 } 1309 if (!ll) PetscFunctionReturn(0); 1310 ai = a->i; 1311 aj = a->j; 1312 aa = a->a; 1313 m = A->rmap->N; 1314 bs = A->rmap->bs; 1315 mbs = a->mbs; 1316 bs2 = a->bs2; 1317 1318 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1319 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1320 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1321 for (i=0; i<mbs; i++) { /* for each block row */ 1322 M = ai[i+1] - ai[i]; 1323 li = l + i*bs; 1324 v = aa + bs2*ai[i]; 1325 for (j=0; j<M; j++) { /* for each block */ 1326 ri = l + bs*aj[ai[i]+j]; 1327 for (k=0; k<bs; k++) { 1328 x = ri[k]; 1329 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1330 } 1331 } 1332 } 1333 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1334 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1335 PetscFunctionReturn(0); 1336 } 1337 1338 #undef __FUNCT__ 1339 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1340 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1341 { 1342 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1343 1344 PetscFunctionBegin; 1345 info->block_size = a->bs2; 1346 info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */ 1347 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1348 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1349 info->assemblies = A->num_ass; 1350 info->mallocs = A->info.mallocs; 1351 info->memory = ((PetscObject)A)->mem; 1352 if (A->factortype) { 1353 info->fill_ratio_given = A->info.fill_ratio_given; 1354 info->fill_ratio_needed = A->info.fill_ratio_needed; 1355 info->factor_mallocs = A->info.factor_mallocs; 1356 } else { 1357 info->fill_ratio_given = 0; 1358 info->fill_ratio_needed = 0; 1359 info->factor_mallocs = 0; 1360 } 1361 PetscFunctionReturn(0); 1362 } 1363 1364 1365 #undef __FUNCT__ 1366 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1367 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1368 { 1369 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1370 PetscErrorCode ierr; 1371 1372 PetscFunctionBegin; 1373 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1374 PetscFunctionReturn(0); 1375 } 1376 1377 #undef __FUNCT__ 1378 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ" 1379 /* 1380 This code does not work since it only checks the upper triangular part of 1381 the matrix. Hence it is not listed in the function table. 1382 */ 1383 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1384 { 1385 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1386 PetscErrorCode ierr; 1387 PetscInt i,j,n,row,col,bs,*ai,*aj,mbs; 1388 PetscReal atmp; 1389 MatScalar *aa; 1390 PetscScalar *x; 1391 PetscInt ncols,brow,bcol,krow,kcol; 1392 1393 PetscFunctionBegin; 1394 if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1395 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1396 bs = A->rmap->bs; 1397 aa = a->a; 1398 ai = a->i; 1399 aj = a->j; 1400 mbs = a->mbs; 1401 1402 ierr = VecSet(v,0.0);CHKERRQ(ierr); 1403 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1404 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1405 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1406 for (i=0; i<mbs; i++) { 1407 ncols = ai[1] - ai[0]; ai++; 1408 brow = bs*i; 1409 for (j=0; j<ncols; j++) { 1410 bcol = bs*(*aj); 1411 for (kcol=0; kcol<bs; kcol++) { 1412 col = bcol + kcol; /* col index */ 1413 for (krow=0; krow<bs; krow++) { 1414 atmp = PetscAbsScalar(*aa); aa++; 1415 row = brow + krow; /* row index */ 1416 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1417 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1418 } 1419 } 1420 aj++; 1421 } 1422 } 1423 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1424 PetscFunctionReturn(0); 1425 } 1426