1 #define PETSCMAT_DLL 2 3 #include "../src/mat/impls/baij/seq/baij.h" 4 #include "../src/inline/spops.h" 5 #include "../src/inline/ilu.h" 6 #include "petscbt.h" 7 #include "../src/mat/impls/sbaij/seq/sbaij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 11 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 12 { 13 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 16 const PetscInt *idx; 17 PetscBT table,table0; 18 19 PetscFunctionBegin; 20 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 21 mbs = a->mbs; 22 ai = a->i; 23 aj = a->j; 24 bs = A->rmap->bs; 25 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 26 ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 27 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 28 ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr); 29 30 for (i=0; i<is_max; i++) { /* for each is */ 31 isz = 0; 32 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 33 34 /* Extract the indices, assume there can be duplicate entries */ 35 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 36 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 37 38 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 39 bcol_max = 0; 40 for (j=0; j<n ; ++j){ 41 brow = idx[j]/bs; /* convert the indices into block indices */ 42 if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 43 if(!PetscBTLookupSet(table,brow)) { 44 nidx[isz++] = brow; 45 if (bcol_max < brow) bcol_max = brow; 46 } 47 } 48 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 49 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 50 51 k = 0; 52 for (j=0; j<ov; j++){ /* for each overlap */ 53 /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 54 ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr); 55 for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); } 56 57 n = isz; /* length of the updated is[i] */ 58 for (brow=0; brow<mbs; brow++){ 59 start = ai[brow]; end = ai[brow+1]; 60 if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */ 61 for (l = start; l<end ; l++){ 62 bcol = aj[l]; 63 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 64 } 65 k++; 66 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 67 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 68 for (l = start; l<end ; l++){ 69 bcol = aj[l]; 70 if (bcol > bcol_max) break; 71 if (PetscBTLookup(table0,bcol)){ 72 if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;} 73 break; /* for l = start; l<end ; l++) */ 74 } 75 } 76 } 77 } 78 } /* for each overlap */ 79 80 /* expand the Index Set */ 81 for (j=0; j<isz; j++) { 82 for (k=0; k<bs; k++) 83 nidx2[j*bs+k] = nidx[j]*bs+k; 84 } 85 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 86 } 87 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 88 ierr = PetscFree(nidx);CHKERRQ(ierr); 89 ierr = PetscFree(nidx2);CHKERRQ(ierr); 90 ierr = PetscBTDestroy(table0);CHKERRQ(ierr); 91 PetscFunctionReturn(0); 92 } 93 94 #undef __FUNCT__ 95 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 96 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B) 97 { 98 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 99 PetscErrorCode ierr; 100 PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 101 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 102 PetscInt nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 103 const PetscInt *irow,*aj = a->j,*ai = a->i; 104 MatScalar *mat_a; 105 Mat C; 106 PetscTruth flag,sorted; 107 108 PetscFunctionBegin; 109 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 110 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 111 if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 112 113 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 114 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 115 116 ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 117 ssmap = smap; 118 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 119 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 120 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 121 /* determine lens of each row */ 122 for (i=0; i<nrows; i++) { 123 kstart = ai[irow[i]]; 124 kend = kstart + a->ilen[irow[i]]; 125 lens[i] = 0; 126 for (k=kstart; k<kend; k++) { 127 if (ssmap[aj[k]]) { 128 lens[i]++; 129 } 130 } 131 } 132 /* Create and fill new matrix */ 133 if (scall == MAT_REUSE_MATRIX) { 134 c = (Mat_SeqSBAIJ *)((*B)->data); 135 136 if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 137 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 138 if (!flag) { 139 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 140 } 141 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 142 C = *B; 143 } else { 144 ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 145 ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 146 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 147 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr); 148 } 149 c = (Mat_SeqSBAIJ *)(C->data); 150 for (i=0; i<nrows; i++) { 151 row = irow[i]; 152 kstart = ai[row]; 153 kend = kstart + a->ilen[row]; 154 mat_i = c->i[i]; 155 mat_j = c->j + mat_i; 156 mat_a = c->a + mat_i*bs2; 157 mat_ilen = c->ilen + i; 158 for (k=kstart; k<kend; k++) { 159 if ((tcol=ssmap[a->j[k]])) { 160 *mat_j++ = tcol - 1; 161 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 162 mat_a += bs2; 163 (*mat_ilen)++; 164 } 165 } 166 } 167 168 /* Free work space */ 169 ierr = PetscFree(smap);CHKERRQ(ierr); 170 ierr = PetscFree(lens);CHKERRQ(ierr); 171 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 172 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 173 174 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 175 *B = C; 176 PetscFunctionReturn(0); 177 } 178 179 #undef __FUNCT__ 180 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 181 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B) 182 { 183 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 184 IS is1; 185 PetscErrorCode ierr; 186 PetscInt *vary,*iary,nrows,i,bs=A->rmap->bs,count; 187 const PetscInt *irow; 188 189 PetscFunctionBegin; 190 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 191 192 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 193 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 194 195 /* Verify if the indices corespond to each element in a block 196 and form the IS with compressed IS */ 197 ierr = PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);CHKERRQ(ierr); 198 iary = vary + a->mbs; 199 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 200 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 201 202 count = 0; 203 for (i=0; i<a->mbs; i++) { 204 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 205 if (vary[i]==bs) iary[count++] = i; 206 } 207 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 208 209 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 210 ierr = PetscFree(vary);CHKERRQ(ierr); 211 212 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr); 213 ISDestroy(is1); 214 PetscFunctionReturn(0); 215 } 216 217 #undef __FUNCT__ 218 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 219 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 220 { 221 PetscErrorCode ierr; 222 PetscInt i; 223 224 PetscFunctionBegin; 225 if (scall == MAT_INITIAL_MATRIX) { 226 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 227 } 228 229 for (i=0; i<n; i++) { 230 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr); 231 } 232 PetscFunctionReturn(0); 233 } 234 235 /* -------------------------------------------------------*/ 236 /* Should check that shapes of vectors and matrices match */ 237 /* -------------------------------------------------------*/ 238 #include "petscblaslapack.h" 239 240 #undef __FUNCT__ 241 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 242 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 243 { 244 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 245 PetscScalar *x,*z,*xb,x1,zero=0.0; 246 MatScalar *v; 247 PetscErrorCode ierr; 248 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 249 PetscInt nonzerorow=0; 250 251 PetscFunctionBegin; 252 ierr = VecSet(zz,zero);CHKERRQ(ierr); 253 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 254 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 255 256 v = a->a; 257 xb = x; 258 259 for (i=0; i<mbs; i++) { 260 n = ai[1] - ai[0]; /* length of i_th row of A */ 261 x1 = *xb++; 262 ib = aj + *ai++; 263 jmin = 0; 264 nonzerorow += (n>0); 265 /* if we ALWAYS required a diagonal entry then could remove this if test */ 266 /* should we use a tmp to hold the accumulated z[i] */ 267 if (*ib == i) { /* (diag of A)*x */ 268 z[i] += *v++ * x[*ib++]; 269 jmin++; 270 } 271 for (j=jmin; j<n; j++) { 272 cval = *ib; 273 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 274 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 275 } 276 } 277 278 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 279 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 280 ierr = PetscLogFlops(2*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr); /* nz = (nz+m)/2 */ 281 PetscFunctionReturn(0); 282 } 283 284 #undef __FUNCT__ 285 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 286 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 287 { 288 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 289 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 290 MatScalar *v; 291 PetscErrorCode ierr; 292 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 293 PetscInt nonzerorow=0; 294 295 PetscFunctionBegin; 296 ierr = VecSet(zz,zero);CHKERRQ(ierr); 297 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 298 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 299 300 v = a->a; 301 xb = x; 302 303 for (i=0; i<mbs; i++) { 304 n = ai[1] - ai[0]; /* length of i_th block row of A */ 305 x1 = xb[0]; x2 = xb[1]; 306 ib = aj + *ai; 307 jmin = 0; 308 nonzerorow += (n>0); 309 if (*ib == i){ /* (diag of A)*x */ 310 z[2*i] += v[0]*x1 + v[2]*x2; 311 z[2*i+1] += v[2]*x1 + v[3]*x2; 312 v += 4; jmin++; 313 } 314 for (j=jmin; j<n; j++) { 315 /* (strict lower triangular part of A)*x */ 316 cval = ib[j]*2; 317 z[cval] += v[0]*x1 + v[1]*x2; 318 z[cval+1] += v[2]*x1 + v[3]*x2; 319 /* (strict upper triangular part of A)*x */ 320 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 321 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 322 v += 4; 323 } 324 xb +=2; ai++; 325 } 326 327 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 328 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 329 ierr = PetscLogFlops(8*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 330 PetscFunctionReturn(0); 331 } 332 333 #undef __FUNCT__ 334 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 335 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 336 { 337 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 338 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 339 MatScalar *v; 340 PetscErrorCode ierr; 341 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 342 PetscInt nonzerorow=0; 343 344 PetscFunctionBegin; 345 ierr = VecSet(zz,zero);CHKERRQ(ierr); 346 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 347 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 348 349 v = a->a; 350 xb = x; 351 352 for (i=0; i<mbs; i++) { 353 n = ai[1] - ai[0]; /* length of i_th block row of A */ 354 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 355 ib = aj + *ai; 356 jmin = 0; 357 nonzerorow += (n>0); 358 if (*ib == i){ /* (diag of A)*x */ 359 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 360 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 361 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 362 v += 9; jmin++; 363 } 364 for (j=jmin; j<n; j++) { 365 /* (strict lower triangular part of A)*x */ 366 cval = ib[j]*3; 367 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 368 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 369 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 370 /* (strict upper triangular part of A)*x */ 371 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 372 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 373 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 374 v += 9; 375 } 376 xb +=3; ai++; 377 } 378 379 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 380 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 381 ierr = PetscLogFlops(18*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 382 PetscFunctionReturn(0); 383 } 384 385 #undef __FUNCT__ 386 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 387 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 388 { 389 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 390 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 391 MatScalar *v; 392 PetscErrorCode ierr; 393 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 394 PetscInt nonzerorow=0; 395 396 PetscFunctionBegin; 397 ierr = VecSet(zz,zero);CHKERRQ(ierr); 398 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 399 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 400 401 v = a->a; 402 xb = x; 403 404 for (i=0; i<mbs; i++) { 405 n = ai[1] - ai[0]; /* length of i_th block row of A */ 406 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 407 ib = aj + *ai; 408 jmin = 0; 409 nonzerorow += (n>0); 410 if (*ib == i){ /* (diag of A)*x */ 411 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 412 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 413 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 414 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 415 v += 16; jmin++; 416 } 417 for (j=jmin; j<n; j++) { 418 /* (strict lower triangular part of A)*x */ 419 cval = ib[j]*4; 420 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 421 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 422 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 423 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 424 /* (strict upper triangular part of A)*x */ 425 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 426 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 427 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 428 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 429 v += 16; 430 } 431 xb +=4; ai++; 432 } 433 434 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 435 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 436 ierr = PetscLogFlops(32*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 437 PetscFunctionReturn(0); 438 } 439 440 #undef __FUNCT__ 441 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 442 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 443 { 444 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 445 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 446 MatScalar *v; 447 PetscErrorCode ierr; 448 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 449 PetscInt nonzerorow=0; 450 451 PetscFunctionBegin; 452 ierr = VecSet(zz,zero);CHKERRQ(ierr); 453 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 454 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 455 456 v = a->a; 457 xb = x; 458 459 for (i=0; i<mbs; i++) { 460 n = ai[1] - ai[0]; /* length of i_th block row of A */ 461 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 462 ib = aj + *ai; 463 jmin = 0; 464 nonzerorow += (n>0); 465 if (*ib == i){ /* (diag of A)*x */ 466 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 467 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 468 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 469 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 470 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 471 v += 25; jmin++; 472 } 473 for (j=jmin; j<n; j++) { 474 /* (strict lower triangular part of A)*x */ 475 cval = ib[j]*5; 476 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 477 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 478 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 479 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 480 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 481 /* (strict upper triangular part of A)*x */ 482 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 483 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 484 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 485 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 486 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 487 v += 25; 488 } 489 xb +=5; ai++; 490 } 491 492 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 493 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 494 ierr = PetscLogFlops(50*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 495 PetscFunctionReturn(0); 496 } 497 498 499 #undef __FUNCT__ 500 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 501 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 502 { 503 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 504 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 505 MatScalar *v; 506 PetscErrorCode ierr; 507 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 508 PetscInt nonzerorow=0; 509 510 PetscFunctionBegin; 511 ierr = VecSet(zz,zero);CHKERRQ(ierr); 512 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 513 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 514 515 v = a->a; 516 xb = x; 517 518 for (i=0; i<mbs; i++) { 519 n = ai[1] - ai[0]; /* length of i_th block row of A */ 520 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 521 ib = aj + *ai; 522 jmin = 0; 523 nonzerorow += (n>0); 524 if (*ib == i){ /* (diag of A)*x */ 525 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 526 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 527 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 528 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 529 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 530 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 531 v += 36; jmin++; 532 } 533 for (j=jmin; j<n; j++) { 534 /* (strict lower triangular part of A)*x */ 535 cval = ib[j]*6; 536 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 537 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 538 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 539 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 540 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 541 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 542 /* (strict upper triangular part of A)*x */ 543 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 544 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 545 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 546 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 547 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 548 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 549 v += 36; 550 } 551 xb +=6; ai++; 552 } 553 554 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 555 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 556 ierr = PetscLogFlops(72*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 557 PetscFunctionReturn(0); 558 } 559 #undef __FUNCT__ 560 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 561 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 562 { 563 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 564 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 565 MatScalar *v; 566 PetscErrorCode ierr; 567 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 568 PetscInt nonzerorow=0; 569 570 PetscFunctionBegin; 571 ierr = VecSet(zz,zero);CHKERRQ(ierr); 572 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 573 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 574 575 v = a->a; 576 xb = x; 577 578 for (i=0; i<mbs; i++) { 579 n = ai[1] - ai[0]; /* length of i_th block row of A */ 580 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 581 ib = aj + *ai; 582 jmin = 0; 583 nonzerorow += (n>0); 584 if (*ib == i){ /* (diag of A)*x */ 585 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 586 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 587 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 588 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 589 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 590 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 591 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 592 v += 49; jmin++; 593 } 594 for (j=jmin; j<n; j++) { 595 /* (strict lower triangular part of A)*x */ 596 cval = ib[j]*7; 597 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 598 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 599 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 600 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 601 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 602 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 603 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 604 /* (strict upper triangular part of A)*x */ 605 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 606 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 607 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 608 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 609 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 610 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 611 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 612 v += 49; 613 } 614 xb +=7; ai++; 615 } 616 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 617 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 618 ierr = PetscLogFlops(98*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 619 PetscFunctionReturn(0); 620 } 621 622 /* 623 This will not work with MatScalar == float because it calls the BLAS 624 */ 625 #undef __FUNCT__ 626 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 627 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 628 { 629 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 630 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 631 MatScalar *v; 632 PetscErrorCode ierr; 633 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 634 PetscInt nonzerorow=0; 635 636 PetscFunctionBegin; 637 ierr = VecSet(zz,zero);CHKERRQ(ierr); 638 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 639 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 640 641 aj = a->j; 642 v = a->a; 643 ii = a->i; 644 645 if (!a->mult_work) { 646 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 647 } 648 work = a->mult_work; 649 650 for (i=0; i<mbs; i++) { 651 n = ii[1] - ii[0]; ncols = n*bs; 652 workt = work; idx=aj+ii[0]; 653 nonzerorow += (n>0); 654 655 /* upper triangular part */ 656 for (j=0; j<n; j++) { 657 xb = x_ptr + bs*(*idx++); 658 for (k=0; k<bs; k++) workt[k] = xb[k]; 659 workt += bs; 660 } 661 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 662 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 663 664 /* strict lower triangular part */ 665 idx = aj+ii[0]; 666 if (*idx == i){ 667 ncols -= bs; v += bs2; idx++; n--; 668 } 669 670 if (ncols > 0){ 671 workt = work; 672 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 673 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 674 for (j=0; j<n; j++) { 675 zb = z_ptr + bs*(*idx++); 676 for (k=0; k<bs; k++) zb[k] += workt[k] ; 677 workt += bs; 678 } 679 } 680 x += bs; v += n*bs2; z += bs; ii++; 681 } 682 683 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 684 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 685 ierr = PetscLogFlops(2*(a->nz*2 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 686 PetscFunctionReturn(0); 687 } 688 689 extern PetscErrorCode VecCopy_Seq(Vec,Vec); 690 #undef __FUNCT__ 691 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 692 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 693 { 694 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 695 PetscScalar *x,*z,*xb,x1; 696 MatScalar *v; 697 PetscErrorCode ierr; 698 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 699 PetscInt nonzerorow=0; 700 701 PetscFunctionBegin; 702 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 703 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 704 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 705 v = a->a; 706 xb = x; 707 708 for (i=0; i<mbs; i++) { 709 n = ai[1] - ai[0]; /* length of i_th row of A */ 710 x1 = xb[0]; 711 ib = aj + *ai; 712 jmin = 0; 713 nonzerorow += (n>0); 714 if (*ib == i) { /* (diag of A)*x */ 715 z[i] += *v++ * x[*ib++]; jmin++; 716 } 717 for (j=jmin; j<n; j++) { 718 cval = *ib; 719 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 720 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 721 } 722 xb++; ai++; 723 } 724 725 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 726 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 727 728 ierr = PetscLogFlops(2*(a->nz*2 - nonzerorow));CHKERRQ(ierr); 729 PetscFunctionReturn(0); 730 } 731 732 #undef __FUNCT__ 733 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 734 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 735 { 736 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 737 PetscScalar *x,*z,*xb,x1,x2; 738 MatScalar *v; 739 PetscErrorCode ierr; 740 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 741 PetscInt nonzerorow=0; 742 743 PetscFunctionBegin; 744 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 745 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 746 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 747 748 v = a->a; 749 xb = x; 750 751 for (i=0; i<mbs; i++) { 752 n = ai[1] - ai[0]; /* length of i_th block row of A */ 753 x1 = xb[0]; x2 = xb[1]; 754 ib = aj + *ai; 755 jmin = 0; 756 nonzerorow += (n>0); 757 if (*ib == i){ /* (diag of A)*x */ 758 z[2*i] += v[0]*x1 + v[2]*x2; 759 z[2*i+1] += v[2]*x1 + v[3]*x2; 760 v += 4; jmin++; 761 } 762 for (j=jmin; j<n; j++) { 763 /* (strict lower triangular part of A)*x */ 764 cval = ib[j]*2; 765 z[cval] += v[0]*x1 + v[1]*x2; 766 z[cval+1] += v[2]*x1 + v[3]*x2; 767 /* (strict upper triangular part of A)*x */ 768 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 769 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 770 v += 4; 771 } 772 xb +=2; ai++; 773 } 774 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 775 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 776 777 ierr = PetscLogFlops(4*(a->nz*2 - nonzerorow));CHKERRQ(ierr); 778 PetscFunctionReturn(0); 779 } 780 781 #undef __FUNCT__ 782 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 783 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 784 { 785 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 786 PetscScalar *x,*z,*xb,x1,x2,x3; 787 MatScalar *v; 788 PetscErrorCode ierr; 789 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 790 PetscInt nonzerorow=0; 791 792 PetscFunctionBegin; 793 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 794 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 795 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 796 797 v = a->a; 798 xb = x; 799 800 for (i=0; i<mbs; i++) { 801 n = ai[1] - ai[0]; /* length of i_th block row of A */ 802 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 803 ib = aj + *ai; 804 jmin = 0; 805 nonzerorow += (n>0); 806 if (*ib == i){ /* (diag of A)*x */ 807 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 808 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 809 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 810 v += 9; jmin++; 811 } 812 for (j=jmin; j<n; j++) { 813 /* (strict lower triangular part of A)*x */ 814 cval = ib[j]*3; 815 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 816 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 817 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 818 /* (strict upper triangular part of A)*x */ 819 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 820 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 821 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 822 v += 9; 823 } 824 xb +=3; ai++; 825 } 826 827 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 828 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 829 830 ierr = PetscLogFlops(18*(a->nz*2 - nonzerorow));CHKERRQ(ierr); 831 PetscFunctionReturn(0); 832 } 833 834 #undef __FUNCT__ 835 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 836 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 837 { 838 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 839 PetscScalar *x,*z,*xb,x1,x2,x3,x4; 840 MatScalar *v; 841 PetscErrorCode ierr; 842 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 843 PetscInt nonzerorow=0; 844 845 PetscFunctionBegin; 846 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 847 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 848 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 849 850 v = a->a; 851 xb = x; 852 853 for (i=0; i<mbs; i++) { 854 n = ai[1] - ai[0]; /* length of i_th block row of A */ 855 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 856 ib = aj + *ai; 857 jmin = 0; 858 nonzerorow += (n>0); 859 if (*ib == i){ /* (diag of A)*x */ 860 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 861 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 862 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 863 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 864 v += 16; jmin++; 865 } 866 for (j=jmin; j<n; j++) { 867 /* (strict lower triangular part of A)*x */ 868 cval = ib[j]*4; 869 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 870 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 871 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 872 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 873 /* (strict upper triangular part of A)*x */ 874 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 875 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 876 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 877 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 878 v += 16; 879 } 880 xb +=4; ai++; 881 } 882 883 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 884 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 885 886 ierr = PetscLogFlops(32*(a->nz*2 - nonzerorow));CHKERRQ(ierr); 887 PetscFunctionReturn(0); 888 } 889 890 #undef __FUNCT__ 891 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 892 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 893 { 894 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 895 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5; 896 MatScalar *v; 897 PetscErrorCode ierr; 898 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 899 PetscInt nonzerorow=0; 900 901 PetscFunctionBegin; 902 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 903 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 904 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 905 906 v = a->a; 907 xb = x; 908 909 for (i=0; i<mbs; i++) { 910 n = ai[1] - ai[0]; /* length of i_th block row of A */ 911 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 912 ib = aj + *ai; 913 jmin = 0; 914 nonzerorow += (n>0); 915 if (*ib == i){ /* (diag of A)*x */ 916 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 917 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 918 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 919 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 920 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 921 v += 25; jmin++; 922 } 923 for (j=jmin; j<n; j++) { 924 /* (strict lower triangular part of A)*x */ 925 cval = ib[j]*5; 926 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 927 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 928 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 929 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 930 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 931 /* (strict upper triangular part of A)*x */ 932 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 933 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 934 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 935 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 936 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 937 v += 25; 938 } 939 xb +=5; ai++; 940 } 941 942 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 943 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 944 945 ierr = PetscLogFlops(50*(a->nz*2 - nonzerorow));CHKERRQ(ierr); 946 PetscFunctionReturn(0); 947 } 948 #undef __FUNCT__ 949 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 950 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 951 { 952 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 953 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6; 954 MatScalar *v; 955 PetscErrorCode ierr; 956 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 957 PetscInt nonzerorow=0; 958 959 PetscFunctionBegin; 960 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 961 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 962 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 963 964 v = a->a; 965 xb = x; 966 967 for (i=0; i<mbs; i++) { 968 n = ai[1] - ai[0]; /* length of i_th block row of A */ 969 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 970 ib = aj + *ai; 971 jmin = 0; 972 nonzerorow += (n>0); 973 if (*ib == i){ /* (diag of A)*x */ 974 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 975 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 976 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 977 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 978 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 979 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 980 v += 36; jmin++; 981 } 982 for (j=jmin; j<n; j++) { 983 /* (strict lower triangular part of A)*x */ 984 cval = ib[j]*6; 985 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 986 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 987 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 988 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 989 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 990 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 991 /* (strict upper triangular part of A)*x */ 992 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 993 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 994 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 995 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 996 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 997 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 998 v += 36; 999 } 1000 xb +=6; ai++; 1001 } 1002 1003 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1004 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1005 1006 ierr = PetscLogFlops(72*(a->nz*2 - nonzerorow));CHKERRQ(ierr); 1007 PetscFunctionReturn(0); 1008 } 1009 1010 #undef __FUNCT__ 1011 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 1012 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1013 { 1014 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1015 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 1016 MatScalar *v; 1017 PetscErrorCode ierr; 1018 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 1019 PetscInt nonzerorow=0; 1020 1021 PetscFunctionBegin; 1022 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 1023 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1024 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1025 1026 v = a->a; 1027 xb = x; 1028 1029 for (i=0; i<mbs; i++) { 1030 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1031 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1032 ib = aj + *ai; 1033 jmin = 0; 1034 nonzerorow += (n>0); 1035 if (*ib == i){ /* (diag of A)*x */ 1036 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1037 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1038 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1039 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1040 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1041 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1042 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1043 v += 49; jmin++; 1044 } 1045 for (j=jmin; j<n; j++) { 1046 /* (strict lower triangular part of A)*x */ 1047 cval = ib[j]*7; 1048 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1049 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1050 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1051 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1052 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1053 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1054 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1055 /* (strict upper triangular part of A)*x */ 1056 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1057 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1058 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1059 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1060 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1061 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1062 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1063 v += 49; 1064 } 1065 xb +=7; ai++; 1066 } 1067 1068 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1069 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1070 1071 ierr = PetscLogFlops(98*(a->nz*2 - nonzerorow));CHKERRQ(ierr); 1072 PetscFunctionReturn(0); 1073 } 1074 1075 #undef __FUNCT__ 1076 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1077 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1078 { 1079 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1080 PetscScalar *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1081 MatScalar *v; 1082 PetscErrorCode ierr; 1083 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1084 PetscInt nonzerorow=0; 1085 1086 PetscFunctionBegin; 1087 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 1088 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 1089 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1090 1091 aj = a->j; 1092 v = a->a; 1093 ii = a->i; 1094 1095 if (!a->mult_work) { 1096 ierr = PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1097 } 1098 work = a->mult_work; 1099 1100 1101 for (i=0; i<mbs; i++) { 1102 n = ii[1] - ii[0]; ncols = n*bs; 1103 workt = work; idx=aj+ii[0]; 1104 nonzerorow += (n>0); 1105 1106 /* upper triangular part */ 1107 for (j=0; j<n; j++) { 1108 xb = x_ptr + bs*(*idx++); 1109 for (k=0; k<bs; k++) workt[k] = xb[k]; 1110 workt += bs; 1111 } 1112 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1113 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1114 1115 /* strict lower triangular part */ 1116 idx = aj+ii[0]; 1117 if (*idx == i){ 1118 ncols -= bs; v += bs2; idx++; n--; 1119 } 1120 if (ncols > 0){ 1121 workt = work; 1122 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1123 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1124 for (j=0; j<n; j++) { 1125 zb = z_ptr + bs*(*idx++); 1126 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1127 workt += bs; 1128 } 1129 } 1130 1131 x += bs; v += n*bs2; z += bs; ii++; 1132 } 1133 1134 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1135 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1136 1137 ierr = PetscLogFlops(2*(a->nz*2 - nonzerorow));CHKERRQ(ierr); 1138 PetscFunctionReturn(0); 1139 } 1140 1141 #undef __FUNCT__ 1142 #define __FUNCT__ "MatScale_SeqSBAIJ" 1143 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1144 { 1145 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1146 PetscScalar oalpha = alpha; 1147 PetscErrorCode ierr; 1148 PetscBLASInt one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz); 1149 1150 PetscFunctionBegin; 1151 BLASscal_(&totalnz,&oalpha,a->a,&one); 1152 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1153 PetscFunctionReturn(0); 1154 } 1155 1156 #undef __FUNCT__ 1157 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1158 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1159 { 1160 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1161 MatScalar *v = a->a; 1162 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1163 PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1164 PetscErrorCode ierr; 1165 PetscInt *jl,*il,jmin,jmax,nexti,ik,*col; 1166 1167 PetscFunctionBegin; 1168 if (type == NORM_FROBENIUS) { 1169 for (k=0; k<mbs; k++){ 1170 jmin = a->i[k]; jmax = a->i[k+1]; 1171 col = aj + jmin; 1172 if (*col == k){ /* diagonal block */ 1173 for (i=0; i<bs2; i++){ 1174 #if defined(PETSC_USE_COMPLEX) 1175 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1176 #else 1177 sum_diag += (*v)*(*v); v++; 1178 #endif 1179 } 1180 jmin++; 1181 } 1182 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1183 for (i=0; i<bs2; i++){ 1184 #if defined(PETSC_USE_COMPLEX) 1185 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1186 #else 1187 sum_off += (*v)*(*v); v++; 1188 #endif 1189 } 1190 } 1191 } 1192 *norm = sqrt(sum_diag + 2*sum_off); 1193 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1194 ierr = PetscMalloc((2*mbs+1)*sizeof(PetscInt)+bs*sizeof(PetscReal),&il);CHKERRQ(ierr); 1195 jl = il + mbs; 1196 sum = (PetscReal*)(jl + mbs); 1197 for (i=0; i<mbs; i++) jl[i] = mbs; 1198 il[0] = 0; 1199 1200 *norm = 0.0; 1201 for (k=0; k<mbs; k++) { /* k_th block row */ 1202 for (j=0; j<bs; j++) sum[j]=0.0; 1203 /*-- col sum --*/ 1204 i = jl[k]; /* first |A(i,k)| to be added */ 1205 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1206 at step k */ 1207 while (i<mbs){ 1208 nexti = jl[i]; /* next block row to be added */ 1209 ik = il[i]; /* block index of A(i,k) in the array a */ 1210 for (j=0; j<bs; j++){ 1211 v = a->a + ik*bs2 + j*bs; 1212 for (k1=0; k1<bs; k1++) { 1213 sum[j] += PetscAbsScalar(*v); v++; 1214 } 1215 } 1216 /* update il, jl */ 1217 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1218 jmax = a->i[i+1]; 1219 if (jmin < jmax){ 1220 il[i] = jmin; 1221 j = a->j[jmin]; 1222 jl[i] = jl[j]; jl[j]=i; 1223 } 1224 i = nexti; 1225 } 1226 /*-- row sum --*/ 1227 jmin = a->i[k]; jmax = a->i[k+1]; 1228 for (i=jmin; i<jmax; i++) { 1229 for (j=0; j<bs; j++){ 1230 v = a->a + i*bs2 + j; 1231 for (k1=0; k1<bs; k1++){ 1232 sum[j] += PetscAbsScalar(*v); v += bs; 1233 } 1234 } 1235 } 1236 /* add k_th block row to il, jl */ 1237 col = aj+jmin; 1238 if (*col == k) jmin++; 1239 if (jmin < jmax){ 1240 il[k] = jmin; 1241 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1242 } 1243 for (j=0; j<bs; j++){ 1244 if (sum[j] > *norm) *norm = sum[j]; 1245 } 1246 } 1247 ierr = PetscFree(il);CHKERRQ(ierr); 1248 } else { 1249 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1250 } 1251 PetscFunctionReturn(0); 1252 } 1253 1254 #undef __FUNCT__ 1255 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1256 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1257 { 1258 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1259 PetscErrorCode ierr; 1260 1261 PetscFunctionBegin; 1262 1263 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1264 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1265 *flg = PETSC_FALSE; 1266 PetscFunctionReturn(0); 1267 } 1268 1269 /* if the a->i are the same */ 1270 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1271 if (!*flg) { 1272 PetscFunctionReturn(0); 1273 } 1274 1275 /* if a->j are the same */ 1276 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1277 if (!*flg) { 1278 PetscFunctionReturn(0); 1279 } 1280 /* if a->a are the same */ 1281 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1282 PetscFunctionReturn(0); 1283 } 1284 1285 #undef __FUNCT__ 1286 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1287 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1288 { 1289 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1290 PetscErrorCode ierr; 1291 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1292 PetscScalar *x,zero = 0.0; 1293 MatScalar *aa,*aa_j; 1294 1295 PetscFunctionBegin; 1296 bs = A->rmap->bs; 1297 if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1298 1299 aa = a->a; 1300 ai = a->i; 1301 aj = a->j; 1302 ambs = a->mbs; 1303 bs2 = a->bs2; 1304 1305 ierr = VecSet(v,zero);CHKERRQ(ierr); 1306 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1307 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1308 if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1309 for (i=0; i<ambs; i++) { 1310 j=ai[i]; 1311 if (aj[j] == i) { /* if this is a diagonal element */ 1312 row = i*bs; 1313 aa_j = aa + j*bs2; 1314 if (A->factor && bs==1){ 1315 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1316 } else { 1317 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1318 } 1319 } 1320 } 1321 1322 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1323 PetscFunctionReturn(0); 1324 } 1325 1326 #undef __FUNCT__ 1327 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1328 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1329 { 1330 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1331 PetscScalar *l,x,*li,*ri; 1332 MatScalar *aa,*v; 1333 PetscErrorCode ierr; 1334 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1335 PetscTruth flg; 1336 1337 PetscFunctionBegin; 1338 if (ll != rr){ 1339 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1340 if (!flg) 1341 SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1342 } 1343 if (!ll) PetscFunctionReturn(0); 1344 ai = a->i; 1345 aj = a->j; 1346 aa = a->a; 1347 m = A->rmap->N; 1348 bs = A->rmap->bs; 1349 mbs = a->mbs; 1350 bs2 = a->bs2; 1351 1352 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1353 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1354 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1355 for (i=0; i<mbs; i++) { /* for each block row */ 1356 M = ai[i+1] - ai[i]; 1357 li = l + i*bs; 1358 v = aa + bs2*ai[i]; 1359 for (j=0; j<M; j++) { /* for each block */ 1360 ri = l + bs*aj[ai[i]+j]; 1361 for (k=0; k<bs; k++) { 1362 x = ri[k]; 1363 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1364 } 1365 } 1366 } 1367 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1368 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 1369 PetscFunctionReturn(0); 1370 } 1371 1372 #undef __FUNCT__ 1373 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1374 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1375 { 1376 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1377 1378 PetscFunctionBegin; 1379 info->block_size = a->bs2; 1380 info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */ 1381 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1382 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1383 info->assemblies = A->num_ass; 1384 info->mallocs = a->reallocs; 1385 info->memory = ((PetscObject)A)->mem; 1386 if (A->factor) { 1387 info->fill_ratio_given = A->info.fill_ratio_given; 1388 info->fill_ratio_needed = A->info.fill_ratio_needed; 1389 info->factor_mallocs = A->info.factor_mallocs; 1390 } else { 1391 info->fill_ratio_given = 0; 1392 info->fill_ratio_needed = 0; 1393 info->factor_mallocs = 0; 1394 } 1395 PetscFunctionReturn(0); 1396 } 1397 1398 1399 #undef __FUNCT__ 1400 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1401 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1402 { 1403 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1404 PetscErrorCode ierr; 1405 1406 PetscFunctionBegin; 1407 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1408 PetscFunctionReturn(0); 1409 } 1410 1411 #undef __FUNCT__ 1412 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ" 1413 /* 1414 This code does not work since it only checks the upper triangular part of 1415 the matrix. Hence it is not listed in the function table. 1416 */ 1417 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1418 { 1419 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1420 PetscErrorCode ierr; 1421 PetscInt i,j,n,row,col,bs,*ai,*aj,mbs; 1422 PetscReal atmp; 1423 MatScalar *aa; 1424 PetscScalar *x; 1425 PetscInt ncols,brow,bcol,krow,kcol; 1426 1427 PetscFunctionBegin; 1428 if (idx) SETERRQ(PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1429 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1430 bs = A->rmap->bs; 1431 aa = a->a; 1432 ai = a->i; 1433 aj = a->j; 1434 mbs = a->mbs; 1435 1436 ierr = VecSet(v,0.0);CHKERRQ(ierr); 1437 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1438 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1439 if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1440 for (i=0; i<mbs; i++) { 1441 ncols = ai[1] - ai[0]; ai++; 1442 brow = bs*i; 1443 for (j=0; j<ncols; j++){ 1444 bcol = bs*(*aj); 1445 for (kcol=0; kcol<bs; kcol++){ 1446 col = bcol + kcol; /* col index */ 1447 for (krow=0; krow<bs; krow++){ 1448 atmp = PetscAbsScalar(*aa); aa++; 1449 row = brow + krow; /* row index */ 1450 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1451 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1452 } 1453 } 1454 aj++; 1455 } 1456 } 1457 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1458 PetscFunctionReturn(0); 1459 } 1460