1 #define PETSCMAT_DLL 2 3 #include "src/mat/impls/baij/seq/baij.h" 4 #include "src/inline/spops.h" 5 #include "src/inline/ilu.h" 6 #include "petscbt.h" 7 #include "src/mat/impls/sbaij/seq/sbaij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 11 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 12 { 13 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt brow,i,j,k,l,mbs,n,*idx,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 16 PetscBT table,table0; 17 18 PetscFunctionBegin; 19 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 20 mbs = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap.bs; 24 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 25 ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 26 ierr = PetscMalloc((A->rmap.N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 27 ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr); 28 29 for (i=0; i<is_max; i++) { /* for each is */ 30 isz = 0; 31 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 32 33 /* Extract the indices, assume there can be duplicate entries */ 34 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36 37 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 38 bcol_max = 0; 39 for (j=0; j<n ; ++j){ 40 brow = idx[j]/bs; /* convert the indices into block indices */ 41 if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 42 if(!PetscBTLookupSet(table,brow)) { 43 nidx[isz++] = brow; 44 if (bcol_max < brow) bcol_max = brow; 45 } 46 } 47 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 48 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 49 50 k = 0; 51 for (j=0; j<ov; j++){ /* for each overlap */ 52 /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 53 ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr); 54 for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); } 55 56 n = isz; /* length of the updated is[i] */ 57 for (brow=0; brow<mbs; brow++){ 58 start = ai[brow]; end = ai[brow+1]; 59 if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */ 60 for (l = start; l<end ; l++){ 61 bcol = aj[l]; 62 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 63 } 64 k++; 65 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 66 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 67 for (l = start; l<end ; l++){ 68 bcol = aj[l]; 69 if (bcol > bcol_max) break; 70 if (PetscBTLookup(table0,bcol)){ 71 if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;} 72 break; /* for l = start; l<end ; l++) */ 73 } 74 } 75 } 76 } 77 } /* for each overlap */ 78 79 /* expand the Index Set */ 80 for (j=0; j<isz; j++) { 81 for (k=0; k<bs; k++) 82 nidx2[j*bs+k] = nidx[j]*bs+k; 83 } 84 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 85 } 86 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 87 ierr = PetscFree(nidx);CHKERRQ(ierr); 88 ierr = PetscFree(nidx2);CHKERRQ(ierr); 89 ierr = PetscBTDestroy(table0);CHKERRQ(ierr); 90 PetscFunctionReturn(0); 91 } 92 93 #undef __FUNCT__ 94 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 95 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B) 96 { 97 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 98 PetscErrorCode ierr; 99 PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 100 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 101 PetscInt *irow,nrows,*ssmap,bs=A->rmap.bs,bs2=a->bs2; 102 PetscInt *aj = a->j,*ai = a->i; 103 MatScalar *mat_a; 104 Mat C; 105 PetscTruth flag; 106 107 PetscFunctionBegin; 108 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 109 ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr); 110 if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 111 112 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 113 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 114 115 ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 116 ssmap = smap; 117 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 118 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 119 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 120 /* determine lens of each row */ 121 for (i=0; i<nrows; i++) { 122 kstart = ai[irow[i]]; 123 kend = kstart + a->ilen[irow[i]]; 124 lens[i] = 0; 125 for (k=kstart; k<kend; k++) { 126 if (ssmap[aj[k]]) { 127 lens[i]++; 128 } 129 } 130 } 131 /* Create and fill new matrix */ 132 if (scall == MAT_REUSE_MATRIX) { 133 c = (Mat_SeqSBAIJ *)((*B)->data); 134 135 if (c->mbs!=nrows || (*B)->rmap.bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 136 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 137 if (!flag) { 138 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 139 } 140 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 141 C = *B; 142 } else { 143 ierr = MatCreate(A->comm,&C);CHKERRQ(ierr); 144 ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 145 ierr = MatSetType(C,A->type_name);CHKERRQ(ierr); 146 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr); 147 } 148 c = (Mat_SeqSBAIJ *)(C->data); 149 for (i=0; i<nrows; i++) { 150 row = irow[i]; 151 kstart = ai[row]; 152 kend = kstart + a->ilen[row]; 153 mat_i = c->i[i]; 154 mat_j = c->j + mat_i; 155 mat_a = c->a + mat_i*bs2; 156 mat_ilen = c->ilen + i; 157 for (k=kstart; k<kend; k++) { 158 if ((tcol=ssmap[a->j[k]])) { 159 *mat_j++ = tcol - 1; 160 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 161 mat_a += bs2; 162 (*mat_ilen)++; 163 } 164 } 165 } 166 167 /* Free work space */ 168 ierr = PetscFree(smap);CHKERRQ(ierr); 169 ierr = PetscFree(lens);CHKERRQ(ierr); 170 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 171 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 172 173 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 174 *B = C; 175 PetscFunctionReturn(0); 176 } 177 178 #undef __FUNCT__ 179 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 180 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B) 181 { 182 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 183 IS is1; 184 PetscErrorCode ierr; 185 PetscInt *vary,*iary,*irow,nrows,i,bs=A->rmap.bs,count; 186 187 PetscFunctionBegin; 188 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 189 190 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 191 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 192 193 /* Verify if the indices corespond to each element in a block 194 and form the IS with compressed IS */ 195 ierr = PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);CHKERRQ(ierr); 196 iary = vary + a->mbs; 197 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 198 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 199 200 count = 0; 201 for (i=0; i<a->mbs; i++) { 202 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 203 if (vary[i]==bs) iary[count++] = i; 204 } 205 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 206 207 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 208 ierr = PetscFree(vary);CHKERRQ(ierr); 209 210 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr); 211 ISDestroy(is1); 212 PetscFunctionReturn(0); 213 } 214 215 #undef __FUNCT__ 216 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 217 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 218 { 219 PetscErrorCode ierr; 220 PetscInt i; 221 222 PetscFunctionBegin; 223 if (scall == MAT_INITIAL_MATRIX) { 224 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 225 } 226 227 for (i=0; i<n; i++) { 228 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr); 229 } 230 PetscFunctionReturn(0); 231 } 232 233 /* -------------------------------------------------------*/ 234 /* Should check that shapes of vectors and matrices match */ 235 /* -------------------------------------------------------*/ 236 #include "petscblaslapack.h" 237 238 #undef __FUNCT__ 239 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 240 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 241 { 242 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 243 PetscScalar *x,*z,*xb,x1,zero=0.0; 244 MatScalar *v; 245 PetscErrorCode ierr; 246 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 247 248 PetscFunctionBegin; 249 ierr = VecSet(zz,zero);CHKERRQ(ierr); 250 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 251 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 252 253 v = a->a; 254 xb = x; 255 256 for (i=0; i<mbs; i++) { 257 n = ai[1] - ai[0]; /* length of i_th row of A */ 258 x1 = *xb++; 259 ib = aj + *ai++; 260 jmin = 0; 261 /* if we ALWAYS required a diagonal entry then could remove this if test */ 262 /* should we use a tmp to hold the accumulated z[i] */ 263 if (*ib == i) { /* (diag of A)*x */ 264 z[i] += *v++ * x[*ib++]; 265 jmin++; 266 } 267 for (j=jmin; j<n; j++) { 268 cval = *ib; 269 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 270 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 271 } 272 } 273 274 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 275 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 276 ierr = PetscLogFlops(2*(a->nz*2 - A->rmap.N) - A->rmap.N);CHKERRQ(ierr); /* nz = (nz+m)/2 */ 277 PetscFunctionReturn(0); 278 } 279 280 #undef __FUNCT__ 281 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 282 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 283 { 284 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 285 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 286 MatScalar *v; 287 PetscErrorCode ierr; 288 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 289 290 PetscFunctionBegin; 291 ierr = VecSet(zz,zero);CHKERRQ(ierr); 292 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 293 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 294 295 v = a->a; 296 xb = x; 297 298 for (i=0; i<mbs; i++) { 299 n = ai[1] - ai[0]; /* length of i_th block row of A */ 300 x1 = xb[0]; x2 = xb[1]; 301 ib = aj + *ai; 302 jmin = 0; 303 if (*ib == i){ /* (diag of A)*x */ 304 z[2*i] += v[0]*x1 + v[2]*x2; 305 z[2*i+1] += v[2]*x1 + v[3]*x2; 306 v += 4; jmin++; 307 } 308 for (j=jmin; j<n; j++) { 309 /* (strict lower triangular part of A)*x */ 310 cval = ib[j]*2; 311 z[cval] += v[0]*x1 + v[1]*x2; 312 z[cval+1] += v[2]*x1 + v[3]*x2; 313 /* (strict upper triangular part of A)*x */ 314 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 315 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 316 v += 4; 317 } 318 xb +=2; ai++; 319 } 320 321 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 322 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 323 ierr = PetscLogFlops(8*(a->nz*2 - A->rmap.N) - A->rmap.N);CHKERRQ(ierr); 324 PetscFunctionReturn(0); 325 } 326 327 #undef __FUNCT__ 328 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 329 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 330 { 331 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 332 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 333 MatScalar *v; 334 PetscErrorCode ierr; 335 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 336 337 PetscFunctionBegin; 338 ierr = VecSet(zz,zero);CHKERRQ(ierr); 339 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 340 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 341 342 v = a->a; 343 xb = x; 344 345 for (i=0; i<mbs; i++) { 346 n = ai[1] - ai[0]; /* length of i_th block row of A */ 347 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 348 ib = aj + *ai; 349 jmin = 0; 350 if (*ib == i){ /* (diag of A)*x */ 351 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 352 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 353 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 354 v += 9; jmin++; 355 } 356 for (j=jmin; j<n; j++) { 357 /* (strict lower triangular part of A)*x */ 358 cval = ib[j]*3; 359 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 360 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 361 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 362 /* (strict upper triangular part of A)*x */ 363 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 364 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 365 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 366 v += 9; 367 } 368 xb +=3; ai++; 369 } 370 371 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 372 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 373 ierr = PetscLogFlops(18*(a->nz*2 - A->rmap.N) - A->rmap.N);CHKERRQ(ierr); 374 PetscFunctionReturn(0); 375 } 376 377 #undef __FUNCT__ 378 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 379 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 380 { 381 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 382 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 383 MatScalar *v; 384 PetscErrorCode ierr; 385 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 386 387 PetscFunctionBegin; 388 ierr = VecSet(zz,zero);CHKERRQ(ierr); 389 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 390 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 391 392 v = a->a; 393 xb = x; 394 395 for (i=0; i<mbs; i++) { 396 n = ai[1] - ai[0]; /* length of i_th block row of A */ 397 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 398 ib = aj + *ai; 399 jmin = 0; 400 if (*ib == i){ /* (diag of A)*x */ 401 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 402 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 403 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 404 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 405 v += 16; jmin++; 406 } 407 for (j=jmin; j<n; j++) { 408 /* (strict lower triangular part of A)*x */ 409 cval = ib[j]*4; 410 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 411 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 412 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 413 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 414 /* (strict upper triangular part of A)*x */ 415 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 416 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 417 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 418 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 419 v += 16; 420 } 421 xb +=4; ai++; 422 } 423 424 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 425 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 426 ierr = PetscLogFlops(32*(a->nz*2 - A->rmap.N) - A->rmap.N);CHKERRQ(ierr); 427 PetscFunctionReturn(0); 428 } 429 430 #undef __FUNCT__ 431 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 432 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 433 { 434 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 435 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 436 MatScalar *v; 437 PetscErrorCode ierr; 438 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 439 440 PetscFunctionBegin; 441 ierr = VecSet(zz,zero);CHKERRQ(ierr); 442 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 443 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 444 445 v = a->a; 446 xb = x; 447 448 for (i=0; i<mbs; i++) { 449 n = ai[1] - ai[0]; /* length of i_th block row of A */ 450 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 451 ib = aj + *ai; 452 jmin = 0; 453 if (*ib == i){ /* (diag of A)*x */ 454 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 455 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 456 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 457 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 458 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 459 v += 25; jmin++; 460 } 461 for (j=jmin; j<n; j++) { 462 /* (strict lower triangular part of A)*x */ 463 cval = ib[j]*5; 464 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 465 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 466 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 467 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 468 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 469 /* (strict upper triangular part of A)*x */ 470 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 471 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 472 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 473 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 474 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 475 v += 25; 476 } 477 xb +=5; ai++; 478 } 479 480 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 481 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 482 ierr = PetscLogFlops(50*(a->nz*2 - A->rmap.N) - A->rmap.N);CHKERRQ(ierr); 483 PetscFunctionReturn(0); 484 } 485 486 487 #undef __FUNCT__ 488 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 489 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 490 { 491 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 492 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 493 MatScalar *v; 494 PetscErrorCode ierr; 495 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 496 497 PetscFunctionBegin; 498 ierr = VecSet(zz,zero);CHKERRQ(ierr); 499 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 500 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 501 502 v = a->a; 503 xb = x; 504 505 for (i=0; i<mbs; i++) { 506 n = ai[1] - ai[0]; /* length of i_th block row of A */ 507 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 508 ib = aj + *ai; 509 jmin = 0; 510 if (*ib == i){ /* (diag of A)*x */ 511 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 512 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 513 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 514 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 515 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 516 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 517 v += 36; jmin++; 518 } 519 for (j=jmin; j<n; j++) { 520 /* (strict lower triangular part of A)*x */ 521 cval = ib[j]*6; 522 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 523 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 524 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 525 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 526 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 527 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 528 /* (strict upper triangular part of A)*x */ 529 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 530 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 531 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 532 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 533 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 534 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 535 v += 36; 536 } 537 xb +=6; ai++; 538 } 539 540 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 541 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 542 ierr = PetscLogFlops(72*(a->nz*2 - A->rmap.N) - A->rmap.N);CHKERRQ(ierr); 543 PetscFunctionReturn(0); 544 } 545 #undef __FUNCT__ 546 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 547 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 548 { 549 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 550 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 551 MatScalar *v; 552 PetscErrorCode ierr; 553 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 554 555 PetscFunctionBegin; 556 ierr = VecSet(zz,zero);CHKERRQ(ierr); 557 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 558 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 559 560 v = a->a; 561 xb = x; 562 563 for (i=0; i<mbs; i++) { 564 n = ai[1] - ai[0]; /* length of i_th block row of A */ 565 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 566 ib = aj + *ai; 567 jmin = 0; 568 if (*ib == i){ /* (diag of A)*x */ 569 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 570 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 571 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 572 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 573 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 574 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 575 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 576 v += 49; jmin++; 577 } 578 for (j=jmin; j<n; j++) { 579 /* (strict lower triangular part of A)*x */ 580 cval = ib[j]*7; 581 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 582 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 583 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 584 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 585 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 586 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 587 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 588 /* (strict upper triangular part of A)*x */ 589 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 590 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 591 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 592 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 593 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 594 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 595 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 596 v += 49; 597 } 598 xb +=7; ai++; 599 } 600 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 601 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 602 ierr = PetscLogFlops(98*(a->nz*2 - A->rmap.N) - A->rmap.N);CHKERRQ(ierr); 603 PetscFunctionReturn(0); 604 } 605 606 /* 607 This will not work with MatScalar == float because it calls the BLAS 608 */ 609 #undef __FUNCT__ 610 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 611 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 612 { 613 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 614 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 615 MatScalar *v; 616 PetscErrorCode ierr; 617 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap.bs,j,n,bs2=a->bs2,ncols,k; 618 619 PetscFunctionBegin; 620 ierr = VecSet(zz,zero);CHKERRQ(ierr); 621 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 622 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 623 624 aj = a->j; 625 v = a->a; 626 ii = a->i; 627 628 if (!a->mult_work) { 629 ierr = PetscMalloc((A->rmap.N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 630 } 631 work = a->mult_work; 632 633 for (i=0; i<mbs; i++) { 634 n = ii[1] - ii[0]; ncols = n*bs; 635 workt = work; idx=aj+ii[0]; 636 637 /* upper triangular part */ 638 for (j=0; j<n; j++) { 639 xb = x_ptr + bs*(*idx++); 640 for (k=0; k<bs; k++) workt[k] = xb[k]; 641 workt += bs; 642 } 643 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 644 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 645 646 /* strict lower triangular part */ 647 idx = aj+ii[0]; 648 if (*idx == i){ 649 ncols -= bs; v += bs2; idx++; n--; 650 } 651 652 if (ncols > 0){ 653 workt = work; 654 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 655 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 656 for (j=0; j<n; j++) { 657 zb = z_ptr + bs*(*idx++); 658 for (k=0; k<bs; k++) zb[k] += workt[k] ; 659 workt += bs; 660 } 661 } 662 x += bs; v += n*bs2; z += bs; ii++; 663 } 664 665 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 666 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 667 ierr = PetscLogFlops(2*(a->nz*2 - A->rmap.N)*bs2 - A->rmap.N);CHKERRQ(ierr); 668 PetscFunctionReturn(0); 669 } 670 671 extern PetscErrorCode VecCopy_Seq(Vec,Vec); 672 #undef __FUNCT__ 673 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 674 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 675 { 676 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 677 PetscScalar *x,*z,*xb,x1; 678 MatScalar *v; 679 PetscErrorCode ierr; 680 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 681 682 PetscFunctionBegin; 683 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 684 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 685 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 686 v = a->a; 687 xb = x; 688 689 for (i=0; i<mbs; i++) { 690 n = ai[1] - ai[0]; /* length of i_th row of A */ 691 x1 = xb[0]; 692 ib = aj + *ai; 693 jmin = 0; 694 if (*ib == i) { /* (diag of A)*x */ 695 z[i] += *v++ * x[*ib++]; jmin++; 696 } 697 for (j=jmin; j<n; j++) { 698 cval = *ib; 699 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 700 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 701 } 702 xb++; ai++; 703 } 704 705 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 706 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 707 708 ierr = PetscLogFlops(2*(a->nz*2 - A->rmap.n));CHKERRQ(ierr); 709 PetscFunctionReturn(0); 710 } 711 712 #undef __FUNCT__ 713 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 714 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 715 { 716 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 717 PetscScalar *x,*z,*xb,x1,x2; 718 MatScalar *v; 719 PetscErrorCode ierr; 720 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 721 722 PetscFunctionBegin; 723 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 724 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 725 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 726 727 v = a->a; 728 xb = x; 729 730 for (i=0; i<mbs; i++) { 731 n = ai[1] - ai[0]; /* length of i_th block row of A */ 732 x1 = xb[0]; x2 = xb[1]; 733 ib = aj + *ai; 734 jmin = 0; 735 if (*ib == i){ /* (diag of A)*x */ 736 z[2*i] += v[0]*x1 + v[2]*x2; 737 z[2*i+1] += v[2]*x1 + v[3]*x2; 738 v += 4; jmin++; 739 } 740 for (j=jmin; j<n; j++) { 741 /* (strict lower triangular part of A)*x */ 742 cval = ib[j]*2; 743 z[cval] += v[0]*x1 + v[1]*x2; 744 z[cval+1] += v[2]*x1 + v[3]*x2; 745 /* (strict upper triangular part of A)*x */ 746 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 747 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 748 v += 4; 749 } 750 xb +=2; ai++; 751 } 752 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 753 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 754 755 ierr = PetscLogFlops(4*(a->nz*2 - A->rmap.n));CHKERRQ(ierr); 756 PetscFunctionReturn(0); 757 } 758 759 #undef __FUNCT__ 760 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 761 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 762 { 763 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 764 PetscScalar *x,*z,*xb,x1,x2,x3; 765 MatScalar *v; 766 PetscErrorCode ierr; 767 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 768 769 PetscFunctionBegin; 770 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 771 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 772 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 773 774 v = a->a; 775 xb = x; 776 777 for (i=0; i<mbs; i++) { 778 n = ai[1] - ai[0]; /* length of i_th block row of A */ 779 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 780 ib = aj + *ai; 781 jmin = 0; 782 if (*ib == i){ /* (diag of A)*x */ 783 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 784 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 785 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 786 v += 9; jmin++; 787 } 788 for (j=jmin; j<n; j++) { 789 /* (strict lower triangular part of A)*x */ 790 cval = ib[j]*3; 791 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 792 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 793 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 794 /* (strict upper triangular part of A)*x */ 795 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 796 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 797 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 798 v += 9; 799 } 800 xb +=3; ai++; 801 } 802 803 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 804 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 805 806 ierr = PetscLogFlops(18*(a->nz*2 - A->rmap.n));CHKERRQ(ierr); 807 PetscFunctionReturn(0); 808 } 809 810 #undef __FUNCT__ 811 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 812 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 813 { 814 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 815 PetscScalar *x,*z,*xb,x1,x2,x3,x4; 816 MatScalar *v; 817 PetscErrorCode ierr; 818 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 819 820 PetscFunctionBegin; 821 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 822 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 823 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 824 825 v = a->a; 826 xb = x; 827 828 for (i=0; i<mbs; i++) { 829 n = ai[1] - ai[0]; /* length of i_th block row of A */ 830 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 831 ib = aj + *ai; 832 jmin = 0; 833 if (*ib == i){ /* (diag of A)*x */ 834 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 835 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 836 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 837 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 838 v += 16; jmin++; 839 } 840 for (j=jmin; j<n; j++) { 841 /* (strict lower triangular part of A)*x */ 842 cval = ib[j]*4; 843 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 844 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 845 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 846 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 847 /* (strict upper triangular part of A)*x */ 848 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 849 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 850 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 851 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 852 v += 16; 853 } 854 xb +=4; ai++; 855 } 856 857 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 858 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 859 860 ierr = PetscLogFlops(32*(a->nz*2 - A->rmap.n));CHKERRQ(ierr); 861 PetscFunctionReturn(0); 862 } 863 864 #undef __FUNCT__ 865 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 866 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 867 { 868 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 869 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5; 870 MatScalar *v; 871 PetscErrorCode ierr; 872 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 873 874 PetscFunctionBegin; 875 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 876 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 877 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 878 879 v = a->a; 880 xb = x; 881 882 for (i=0; i<mbs; i++) { 883 n = ai[1] - ai[0]; /* length of i_th block row of A */ 884 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 885 ib = aj + *ai; 886 jmin = 0; 887 if (*ib == i){ /* (diag of A)*x */ 888 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 889 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 890 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 891 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 892 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 893 v += 25; jmin++; 894 } 895 for (j=jmin; j<n; j++) { 896 /* (strict lower triangular part of A)*x */ 897 cval = ib[j]*5; 898 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 899 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 900 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 901 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 902 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 903 /* (strict upper triangular part of A)*x */ 904 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 905 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 906 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 907 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 908 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 909 v += 25; 910 } 911 xb +=5; ai++; 912 } 913 914 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 915 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 916 917 ierr = PetscLogFlops(50*(a->nz*2 - A->rmap.n));CHKERRQ(ierr); 918 PetscFunctionReturn(0); 919 } 920 #undef __FUNCT__ 921 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 922 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 923 { 924 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 925 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6; 926 MatScalar *v; 927 PetscErrorCode ierr; 928 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 929 930 PetscFunctionBegin; 931 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 932 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 933 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 934 935 v = a->a; 936 xb = x; 937 938 for (i=0; i<mbs; i++) { 939 n = ai[1] - ai[0]; /* length of i_th block row of A */ 940 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 941 ib = aj + *ai; 942 jmin = 0; 943 if (*ib == i){ /* (diag of A)*x */ 944 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 945 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 946 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 947 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 948 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 949 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 950 v += 36; jmin++; 951 } 952 for (j=jmin; j<n; j++) { 953 /* (strict lower triangular part of A)*x */ 954 cval = ib[j]*6; 955 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 956 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 957 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 958 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 959 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 960 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 961 /* (strict upper triangular part of A)*x */ 962 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 963 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 964 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 965 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 966 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 967 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 968 v += 36; 969 } 970 xb +=6; ai++; 971 } 972 973 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 974 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 975 976 ierr = PetscLogFlops(72*(a->nz*2 - A->rmap.n));CHKERRQ(ierr); 977 PetscFunctionReturn(0); 978 } 979 980 #undef __FUNCT__ 981 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 982 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 983 { 984 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 985 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 986 MatScalar *v; 987 PetscErrorCode ierr; 988 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 989 990 PetscFunctionBegin; 991 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 992 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 993 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 994 995 v = a->a; 996 xb = x; 997 998 for (i=0; i<mbs; i++) { 999 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1000 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1001 ib = aj + *ai; 1002 jmin = 0; 1003 if (*ib == i){ /* (diag of A)*x */ 1004 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1005 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1006 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1007 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1008 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1009 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1010 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1011 v += 49; jmin++; 1012 } 1013 for (j=jmin; j<n; j++) { 1014 /* (strict lower triangular part of A)*x */ 1015 cval = ib[j]*7; 1016 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1017 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1018 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1019 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1020 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1021 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1022 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1023 /* (strict upper triangular part of A)*x */ 1024 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1025 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1026 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1027 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1028 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1029 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1030 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1031 v += 49; 1032 } 1033 xb +=7; ai++; 1034 } 1035 1036 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1037 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1038 1039 ierr = PetscLogFlops(98*(a->nz*2 - A->rmap.n));CHKERRQ(ierr); 1040 PetscFunctionReturn(0); 1041 } 1042 1043 #undef __FUNCT__ 1044 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1045 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1046 { 1047 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1048 PetscScalar *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1049 MatScalar *v; 1050 PetscErrorCode ierr; 1051 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap.bs,j,n,bs2=a->bs2,ncols,k; 1052 1053 PetscFunctionBegin; 1054 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 1055 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 1056 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1057 1058 aj = a->j; 1059 v = a->a; 1060 ii = a->i; 1061 1062 if (!a->mult_work) { 1063 ierr = PetscMalloc((A->rmap.n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1064 } 1065 work = a->mult_work; 1066 1067 1068 for (i=0; i<mbs; i++) { 1069 n = ii[1] - ii[0]; ncols = n*bs; 1070 workt = work; idx=aj+ii[0]; 1071 1072 /* upper triangular part */ 1073 for (j=0; j<n; j++) { 1074 xb = x_ptr + bs*(*idx++); 1075 for (k=0; k<bs; k++) workt[k] = xb[k]; 1076 workt += bs; 1077 } 1078 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1079 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1080 1081 /* strict lower triangular part */ 1082 idx = aj+ii[0]; 1083 if (*idx == i){ 1084 ncols -= bs; v += bs2; idx++; n--; 1085 } 1086 if (ncols > 0){ 1087 workt = work; 1088 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1089 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1090 for (j=0; j<n; j++) { 1091 zb = z_ptr + bs*(*idx++); 1092 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1093 workt += bs; 1094 } 1095 } 1096 1097 x += bs; v += n*bs2; z += bs; ii++; 1098 } 1099 1100 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1101 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1102 1103 ierr = PetscLogFlops(2*(a->nz*2 - A->rmap.n));CHKERRQ(ierr); 1104 PetscFunctionReturn(0); 1105 } 1106 1107 #undef __FUNCT__ 1108 #define __FUNCT__ "MatScale_SeqSBAIJ" 1109 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1110 { 1111 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1112 PetscScalar oalpha = alpha; 1113 PetscBLASInt one = 1,totalnz = (PetscBLASInt)a->bs2*a->nz; 1114 PetscErrorCode ierr; 1115 1116 PetscFunctionBegin; 1117 BLASscal_(&totalnz,&oalpha,a->a,&one); 1118 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1119 PetscFunctionReturn(0); 1120 } 1121 1122 #undef __FUNCT__ 1123 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1124 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1125 { 1126 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1127 MatScalar *v = a->a; 1128 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1129 PetscInt i,j,k,bs = A->rmap.bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1130 PetscErrorCode ierr; 1131 PetscInt *jl,*il,jmin,jmax,nexti,ik,*col; 1132 1133 PetscFunctionBegin; 1134 if (type == NORM_FROBENIUS) { 1135 for (k=0; k<mbs; k++){ 1136 jmin = a->i[k]; jmax = a->i[k+1]; 1137 col = aj + jmin; 1138 if (*col == k){ /* diagonal block */ 1139 for (i=0; i<bs2; i++){ 1140 #if defined(PETSC_USE_COMPLEX) 1141 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1142 #else 1143 sum_diag += (*v)*(*v); v++; 1144 #endif 1145 } 1146 jmin++; 1147 } 1148 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1149 for (i=0; i<bs2; i++){ 1150 #if defined(PETSC_USE_COMPLEX) 1151 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1152 #else 1153 sum_off += (*v)*(*v); v++; 1154 #endif 1155 } 1156 } 1157 } 1158 *norm = sqrt(sum_diag + 2*sum_off); 1159 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1160 ierr = PetscMalloc((2*mbs+1)*sizeof(PetscInt)+bs*sizeof(PetscReal),&il);CHKERRQ(ierr); 1161 jl = il + mbs; 1162 sum = (PetscReal*)(jl + mbs); 1163 for (i=0; i<mbs; i++) jl[i] = mbs; 1164 il[0] = 0; 1165 1166 *norm = 0.0; 1167 for (k=0; k<mbs; k++) { /* k_th block row */ 1168 for (j=0; j<bs; j++) sum[j]=0.0; 1169 /*-- col sum --*/ 1170 i = jl[k]; /* first |A(i,k)| to be added */ 1171 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1172 at step k */ 1173 while (i<mbs){ 1174 nexti = jl[i]; /* next block row to be added */ 1175 ik = il[i]; /* block index of A(i,k) in the array a */ 1176 for (j=0; j<bs; j++){ 1177 v = a->a + ik*bs2 + j*bs; 1178 for (k1=0; k1<bs; k1++) { 1179 sum[j] += PetscAbsScalar(*v); v++; 1180 } 1181 } 1182 /* update il, jl */ 1183 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1184 jmax = a->i[i+1]; 1185 if (jmin < jmax){ 1186 il[i] = jmin; 1187 j = a->j[jmin]; 1188 jl[i] = jl[j]; jl[j]=i; 1189 } 1190 i = nexti; 1191 } 1192 /*-- row sum --*/ 1193 jmin = a->i[k]; jmax = a->i[k+1]; 1194 for (i=jmin; i<jmax; i++) { 1195 for (j=0; j<bs; j++){ 1196 v = a->a + i*bs2 + j; 1197 for (k1=0; k1<bs; k1++){ 1198 sum[j] += PetscAbsScalar(*v); v += bs; 1199 } 1200 } 1201 } 1202 /* add k_th block row to il, jl */ 1203 col = aj+jmin; 1204 if (*col == k) jmin++; 1205 if (jmin < jmax){ 1206 il[k] = jmin; 1207 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1208 } 1209 for (j=0; j<bs; j++){ 1210 if (sum[j] > *norm) *norm = sum[j]; 1211 } 1212 } 1213 ierr = PetscFree(il);CHKERRQ(ierr); 1214 } else { 1215 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1216 } 1217 PetscFunctionReturn(0); 1218 } 1219 1220 #undef __FUNCT__ 1221 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1222 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1223 { 1224 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1225 PetscErrorCode ierr; 1226 1227 PetscFunctionBegin; 1228 1229 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1230 if ((A->rmap.N != B->rmap.N) || (A->cmap.n != B->cmap.n) || (A->rmap.bs != B->rmap.bs)|| (a->nz != b->nz)) { 1231 *flg = PETSC_FALSE; 1232 PetscFunctionReturn(0); 1233 } 1234 1235 /* if the a->i are the same */ 1236 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1237 if (!*flg) { 1238 PetscFunctionReturn(0); 1239 } 1240 1241 /* if a->j are the same */ 1242 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1243 if (!*flg) { 1244 PetscFunctionReturn(0); 1245 } 1246 /* if a->a are the same */ 1247 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap.bs)*(A->rmap.bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1248 PetscFunctionReturn(0); 1249 } 1250 1251 #undef __FUNCT__ 1252 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1253 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1254 { 1255 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1256 PetscErrorCode ierr; 1257 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1258 PetscScalar *x,zero = 0.0; 1259 MatScalar *aa,*aa_j; 1260 1261 PetscFunctionBegin; 1262 bs = A->rmap.bs; 1263 if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1264 1265 aa = a->a; 1266 ai = a->i; 1267 aj = a->j; 1268 ambs = a->mbs; 1269 bs2 = a->bs2; 1270 1271 ierr = VecSet(v,zero);CHKERRQ(ierr); 1272 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1273 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1274 if (n != A->rmap.N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1275 for (i=0; i<ambs; i++) { 1276 j=ai[i]; 1277 if (aj[j] == i) { /* if this is a diagonal element */ 1278 row = i*bs; 1279 aa_j = aa + j*bs2; 1280 if (A->factor && bs==1){ 1281 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1282 } else { 1283 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1284 } 1285 } 1286 } 1287 1288 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1289 PetscFunctionReturn(0); 1290 } 1291 1292 #undef __FUNCT__ 1293 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1294 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1295 { 1296 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1297 PetscScalar *l,x,*li,*ri; 1298 MatScalar *aa,*v; 1299 PetscErrorCode ierr; 1300 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1301 PetscTruth flg; 1302 1303 PetscFunctionBegin; 1304 if (ll != rr){ 1305 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1306 if (!flg) 1307 SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1308 } 1309 if (!ll) PetscFunctionReturn(0); 1310 ai = a->i; 1311 aj = a->j; 1312 aa = a->a; 1313 m = A->rmap.N; 1314 bs = A->rmap.bs; 1315 mbs = a->mbs; 1316 bs2 = a->bs2; 1317 1318 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1319 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1320 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1321 for (i=0; i<mbs; i++) { /* for each block row */ 1322 M = ai[i+1] - ai[i]; 1323 li = l + i*bs; 1324 v = aa + bs2*ai[i]; 1325 for (j=0; j<M; j++) { /* for each block */ 1326 ri = l + bs*aj[ai[i]+j]; 1327 for (k=0; k<bs; k++) { 1328 x = ri[k]; 1329 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1330 } 1331 } 1332 } 1333 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1334 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 1335 PetscFunctionReturn(0); 1336 } 1337 1338 #undef __FUNCT__ 1339 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1340 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1341 { 1342 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1343 1344 PetscFunctionBegin; 1345 info->rows_global = (double)A->rmap.N; 1346 info->columns_global = (double)A->rmap.N; 1347 info->rows_local = (double)A->rmap.N; 1348 info->columns_local = (double)A->rmap.N; 1349 info->block_size = a->bs2; 1350 info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */ 1351 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1352 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1353 info->assemblies = A->num_ass; 1354 info->mallocs = a->reallocs; 1355 info->memory = A->mem; 1356 if (A->factor) { 1357 info->fill_ratio_given = A->info.fill_ratio_given; 1358 info->fill_ratio_needed = A->info.fill_ratio_needed; 1359 info->factor_mallocs = A->info.factor_mallocs; 1360 } else { 1361 info->fill_ratio_given = 0; 1362 info->fill_ratio_needed = 0; 1363 info->factor_mallocs = 0; 1364 } 1365 PetscFunctionReturn(0); 1366 } 1367 1368 1369 #undef __FUNCT__ 1370 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1371 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1372 { 1373 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1374 PetscErrorCode ierr; 1375 1376 PetscFunctionBegin; 1377 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1378 PetscFunctionReturn(0); 1379 } 1380 1381 #undef __FUNCT__ 1382 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ" 1383 PetscErrorCode MatGetRowMax_SeqSBAIJ(Mat A,Vec v) 1384 { 1385 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1386 PetscErrorCode ierr; 1387 PetscInt i,j,n,row,col,bs,*ai,*aj,mbs; 1388 PetscReal atmp; 1389 MatScalar *aa; 1390 PetscScalar zero = 0.0,*x; 1391 PetscInt ncols,brow,bcol,krow,kcol; 1392 1393 PetscFunctionBegin; 1394 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1395 bs = A->rmap.bs; 1396 aa = a->a; 1397 ai = a->i; 1398 aj = a->j; 1399 mbs = a->mbs; 1400 1401 ierr = VecSet(v,zero);CHKERRQ(ierr); 1402 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1403 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1404 if (n != A->rmap.N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1405 for (i=0; i<mbs; i++) { 1406 ncols = ai[1] - ai[0]; ai++; 1407 brow = bs*i; 1408 for (j=0; j<ncols; j++){ 1409 bcol = bs*(*aj); 1410 for (kcol=0; kcol<bs; kcol++){ 1411 col = bcol + kcol; /* col index */ 1412 for (krow=0; krow<bs; krow++){ 1413 atmp = PetscAbsScalar(*aa); aa++; 1414 row = brow + krow; /* row index */ 1415 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1416 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1417 } 1418 } 1419 aj++; 1420 } 1421 } 1422 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1423 PetscFunctionReturn(0); 1424 } 1425