1 #define PETSCMAT_DLL 2 3 #include "src/mat/impls/baij/seq/baij.h" 4 #include "src/inline/spops.h" 5 #include "src/inline/ilu.h" 6 #include "petscbt.h" 7 #include "src/mat/impls/sbaij/seq/sbaij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 11 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 12 { 13 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt brow,i,j,k,l,mbs,n,*idx,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 16 PetscBT table,table0; 17 18 PetscFunctionBegin; 19 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 20 mbs = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap.bs; 24 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 25 ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 26 ierr = PetscMalloc((A->rmap.N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 27 ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr); 28 29 for (i=0; i<is_max; i++) { /* for each is */ 30 isz = 0; 31 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 32 33 /* Extract the indices, assume there can be duplicate entries */ 34 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36 37 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 38 bcol_max = 0; 39 for (j=0; j<n ; ++j){ 40 brow = idx[j]/bs; /* convert the indices into block indices */ 41 if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 42 if(!PetscBTLookupSet(table,brow)) { 43 nidx[isz++] = brow; 44 if (bcol_max < brow) bcol_max = brow; 45 } 46 } 47 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 48 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 49 50 k = 0; 51 for (j=0; j<ov; j++){ /* for each overlap */ 52 /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 53 ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr); 54 for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); } 55 56 n = isz; /* length of the updated is[i] */ 57 for (brow=0; brow<mbs; brow++){ 58 start = ai[brow]; end = ai[brow+1]; 59 if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */ 60 for (l = start; l<end ; l++){ 61 bcol = aj[l]; 62 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 63 } 64 k++; 65 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 66 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 67 for (l = start; l<end ; l++){ 68 bcol = aj[l]; 69 if (bcol > bcol_max) break; 70 if (PetscBTLookup(table0,bcol)){ 71 if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;} 72 break; /* for l = start; l<end ; l++) */ 73 } 74 } 75 } 76 } 77 } /* for each overlap */ 78 79 /* expand the Index Set */ 80 for (j=0; j<isz; j++) { 81 for (k=0; k<bs; k++) 82 nidx2[j*bs+k] = nidx[j]*bs+k; 83 } 84 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 85 } 86 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 87 ierr = PetscFree(nidx);CHKERRQ(ierr); 88 ierr = PetscFree(nidx2);CHKERRQ(ierr); 89 ierr = PetscBTDestroy(table0);CHKERRQ(ierr); 90 PetscFunctionReturn(0); 91 } 92 93 #undef __FUNCT__ 94 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 95 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B) 96 { 97 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 98 PetscErrorCode ierr; 99 PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 100 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 101 PetscInt *irow,nrows,*ssmap,bs=A->rmap.bs,bs2=a->bs2; 102 PetscInt *aj = a->j,*ai = a->i; 103 MatScalar *mat_a; 104 Mat C; 105 PetscTruth flag; 106 107 PetscFunctionBegin; 108 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 109 ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr); 110 if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 111 112 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 113 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 114 115 ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 116 ssmap = smap; 117 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 118 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 119 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 120 /* determine lens of each row */ 121 for (i=0; i<nrows; i++) { 122 kstart = ai[irow[i]]; 123 kend = kstart + a->ilen[irow[i]]; 124 lens[i] = 0; 125 for (k=kstart; k<kend; k++) { 126 if (ssmap[aj[k]]) { 127 lens[i]++; 128 } 129 } 130 } 131 /* Create and fill new matrix */ 132 if (scall == MAT_REUSE_MATRIX) { 133 c = (Mat_SeqSBAIJ *)((*B)->data); 134 135 if (c->mbs!=nrows || (*B)->rmap.bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 136 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 137 if (!flag) { 138 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 139 } 140 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 141 C = *B; 142 } else { 143 ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 144 ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 145 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 146 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr); 147 } 148 c = (Mat_SeqSBAIJ *)(C->data); 149 for (i=0; i<nrows; i++) { 150 row = irow[i]; 151 kstart = ai[row]; 152 kend = kstart + a->ilen[row]; 153 mat_i = c->i[i]; 154 mat_j = c->j + mat_i; 155 mat_a = c->a + mat_i*bs2; 156 mat_ilen = c->ilen + i; 157 for (k=kstart; k<kend; k++) { 158 if ((tcol=ssmap[a->j[k]])) { 159 *mat_j++ = tcol - 1; 160 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 161 mat_a += bs2; 162 (*mat_ilen)++; 163 } 164 } 165 } 166 167 /* Free work space */ 168 ierr = PetscFree(smap);CHKERRQ(ierr); 169 ierr = PetscFree(lens);CHKERRQ(ierr); 170 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 171 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 172 173 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 174 *B = C; 175 PetscFunctionReturn(0); 176 } 177 178 #undef __FUNCT__ 179 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 180 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B) 181 { 182 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 183 IS is1; 184 PetscErrorCode ierr; 185 PetscInt *vary,*iary,*irow,nrows,i,bs=A->rmap.bs,count; 186 187 PetscFunctionBegin; 188 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 189 190 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 191 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 192 193 /* Verify if the indices corespond to each element in a block 194 and form the IS with compressed IS */ 195 ierr = PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);CHKERRQ(ierr); 196 iary = vary + a->mbs; 197 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 198 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 199 200 count = 0; 201 for (i=0; i<a->mbs; i++) { 202 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 203 if (vary[i]==bs) iary[count++] = i; 204 } 205 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 206 207 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 208 ierr = PetscFree(vary);CHKERRQ(ierr); 209 210 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr); 211 ISDestroy(is1); 212 PetscFunctionReturn(0); 213 } 214 215 #undef __FUNCT__ 216 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 217 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 218 { 219 PetscErrorCode ierr; 220 PetscInt i; 221 222 PetscFunctionBegin; 223 if (scall == MAT_INITIAL_MATRIX) { 224 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 225 } 226 227 for (i=0; i<n; i++) { 228 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr); 229 } 230 PetscFunctionReturn(0); 231 } 232 233 /* -------------------------------------------------------*/ 234 /* Should check that shapes of vectors and matrices match */ 235 /* -------------------------------------------------------*/ 236 #include "petscblaslapack.h" 237 238 #undef __FUNCT__ 239 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 240 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 241 { 242 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 243 PetscScalar *x,*z,*xb,x1,zero=0.0; 244 MatScalar *v; 245 PetscErrorCode ierr; 246 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 247 PetscInt nonzerorow=0; 248 249 PetscFunctionBegin; 250 ierr = VecSet(zz,zero);CHKERRQ(ierr); 251 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 252 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 253 254 v = a->a; 255 xb = x; 256 257 for (i=0; i<mbs; i++) { 258 n = ai[1] - ai[0]; /* length of i_th row of A */ 259 x1 = *xb++; 260 ib = aj + *ai++; 261 jmin = 0; 262 nonzerorow += (n>0); 263 /* if we ALWAYS required a diagonal entry then could remove this if test */ 264 /* should we use a tmp to hold the accumulated z[i] */ 265 if (*ib == i) { /* (diag of A)*x */ 266 z[i] += *v++ * x[*ib++]; 267 jmin++; 268 } 269 for (j=jmin; j<n; j++) { 270 cval = *ib; 271 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 272 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 273 } 274 } 275 276 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 277 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 278 ierr = PetscLogFlops(2*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr); /* nz = (nz+m)/2 */ 279 PetscFunctionReturn(0); 280 } 281 282 #undef __FUNCT__ 283 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 284 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 285 { 286 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 287 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 288 MatScalar *v; 289 PetscErrorCode ierr; 290 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 291 PetscInt nonzerorow=0; 292 293 PetscFunctionBegin; 294 ierr = VecSet(zz,zero);CHKERRQ(ierr); 295 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 296 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 297 298 v = a->a; 299 xb = x; 300 301 for (i=0; i<mbs; i++) { 302 n = ai[1] - ai[0]; /* length of i_th block row of A */ 303 x1 = xb[0]; x2 = xb[1]; 304 ib = aj + *ai; 305 jmin = 0; 306 nonzerorow += (n>0); 307 if (*ib == i){ /* (diag of A)*x */ 308 z[2*i] += v[0]*x1 + v[2]*x2; 309 z[2*i+1] += v[2]*x1 + v[3]*x2; 310 v += 4; jmin++; 311 } 312 for (j=jmin; j<n; j++) { 313 /* (strict lower triangular part of A)*x */ 314 cval = ib[j]*2; 315 z[cval] += v[0]*x1 + v[1]*x2; 316 z[cval+1] += v[2]*x1 + v[3]*x2; 317 /* (strict upper triangular part of A)*x */ 318 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 319 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 320 v += 4; 321 } 322 xb +=2; ai++; 323 } 324 325 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 326 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 327 ierr = PetscLogFlops(8*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 328 PetscFunctionReturn(0); 329 } 330 331 #undef __FUNCT__ 332 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 333 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 334 { 335 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 336 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 337 MatScalar *v; 338 PetscErrorCode ierr; 339 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 340 PetscInt nonzerorow=0; 341 342 PetscFunctionBegin; 343 ierr = VecSet(zz,zero);CHKERRQ(ierr); 344 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 345 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 346 347 v = a->a; 348 xb = x; 349 350 for (i=0; i<mbs; i++) { 351 n = ai[1] - ai[0]; /* length of i_th block row of A */ 352 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 353 ib = aj + *ai; 354 jmin = 0; 355 nonzerorow += (n>0); 356 if (*ib == i){ /* (diag of A)*x */ 357 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 358 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 359 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 360 v += 9; jmin++; 361 } 362 for (j=jmin; j<n; j++) { 363 /* (strict lower triangular part of A)*x */ 364 cval = ib[j]*3; 365 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 366 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 367 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 368 /* (strict upper triangular part of A)*x */ 369 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 370 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 371 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 372 v += 9; 373 } 374 xb +=3; ai++; 375 } 376 377 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 378 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 379 ierr = PetscLogFlops(18*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 380 PetscFunctionReturn(0); 381 } 382 383 #undef __FUNCT__ 384 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 385 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 386 { 387 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 388 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 389 MatScalar *v; 390 PetscErrorCode ierr; 391 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 392 PetscInt nonzerorow=0; 393 394 PetscFunctionBegin; 395 ierr = VecSet(zz,zero);CHKERRQ(ierr); 396 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 397 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 398 399 v = a->a; 400 xb = x; 401 402 for (i=0; i<mbs; i++) { 403 n = ai[1] - ai[0]; /* length of i_th block row of A */ 404 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 405 ib = aj + *ai; 406 jmin = 0; 407 nonzerorow += (n>0); 408 if (*ib == i){ /* (diag of A)*x */ 409 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 410 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 411 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 412 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 413 v += 16; jmin++; 414 } 415 for (j=jmin; j<n; j++) { 416 /* (strict lower triangular part of A)*x */ 417 cval = ib[j]*4; 418 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 419 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 420 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 421 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 422 /* (strict upper triangular part of A)*x */ 423 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 424 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 425 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 426 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 427 v += 16; 428 } 429 xb +=4; ai++; 430 } 431 432 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 433 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 434 ierr = PetscLogFlops(32*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 435 PetscFunctionReturn(0); 436 } 437 438 #undef __FUNCT__ 439 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 440 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 441 { 442 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 443 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 444 MatScalar *v; 445 PetscErrorCode ierr; 446 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 447 PetscInt nonzerorow=0; 448 449 PetscFunctionBegin; 450 ierr = VecSet(zz,zero);CHKERRQ(ierr); 451 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 452 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 453 454 v = a->a; 455 xb = x; 456 457 for (i=0; i<mbs; i++) { 458 n = ai[1] - ai[0]; /* length of i_th block row of A */ 459 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 460 ib = aj + *ai; 461 jmin = 0; 462 nonzerorow += (n>0); 463 if (*ib == i){ /* (diag of A)*x */ 464 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 465 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 466 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 467 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 468 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 469 v += 25; jmin++; 470 } 471 for (j=jmin; j<n; j++) { 472 /* (strict lower triangular part of A)*x */ 473 cval = ib[j]*5; 474 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 475 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 476 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 477 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 478 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 479 /* (strict upper triangular part of A)*x */ 480 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 481 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 482 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 483 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 484 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 485 v += 25; 486 } 487 xb +=5; ai++; 488 } 489 490 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 491 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 492 ierr = PetscLogFlops(50*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 493 PetscFunctionReturn(0); 494 } 495 496 497 #undef __FUNCT__ 498 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 499 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 500 { 501 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 502 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 503 MatScalar *v; 504 PetscErrorCode ierr; 505 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 506 PetscInt nonzerorow=0; 507 508 PetscFunctionBegin; 509 ierr = VecSet(zz,zero);CHKERRQ(ierr); 510 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 511 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 512 513 v = a->a; 514 xb = x; 515 516 for (i=0; i<mbs; i++) { 517 n = ai[1] - ai[0]; /* length of i_th block row of A */ 518 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 519 ib = aj + *ai; 520 jmin = 0; 521 nonzerorow += (n>0); 522 if (*ib == i){ /* (diag of A)*x */ 523 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 524 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 525 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 526 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 527 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 528 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 529 v += 36; jmin++; 530 } 531 for (j=jmin; j<n; j++) { 532 /* (strict lower triangular part of A)*x */ 533 cval = ib[j]*6; 534 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 535 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 536 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 537 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 538 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 539 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 540 /* (strict upper triangular part of A)*x */ 541 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 542 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 543 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 544 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 545 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 546 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 547 v += 36; 548 } 549 xb +=6; ai++; 550 } 551 552 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 553 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 554 ierr = PetscLogFlops(72*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 555 PetscFunctionReturn(0); 556 } 557 #undef __FUNCT__ 558 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 559 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 560 { 561 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 562 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 563 MatScalar *v; 564 PetscErrorCode ierr; 565 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 566 PetscInt nonzerorow=0; 567 568 PetscFunctionBegin; 569 ierr = VecSet(zz,zero);CHKERRQ(ierr); 570 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 571 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 572 573 v = a->a; 574 xb = x; 575 576 for (i=0; i<mbs; i++) { 577 n = ai[1] - ai[0]; /* length of i_th block row of A */ 578 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 579 ib = aj + *ai; 580 jmin = 0; 581 nonzerorow += (n>0); 582 if (*ib == i){ /* (diag of A)*x */ 583 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 584 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 585 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 586 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 587 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 588 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 589 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 590 v += 49; jmin++; 591 } 592 for (j=jmin; j<n; j++) { 593 /* (strict lower triangular part of A)*x */ 594 cval = ib[j]*7; 595 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 596 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 597 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 598 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 599 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 600 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 601 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 602 /* (strict upper triangular part of A)*x */ 603 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 604 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 605 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 606 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 607 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 608 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 609 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 610 v += 49; 611 } 612 xb +=7; ai++; 613 } 614 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 615 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 616 ierr = PetscLogFlops(98*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 617 PetscFunctionReturn(0); 618 } 619 620 /* 621 This will not work with MatScalar == float because it calls the BLAS 622 */ 623 #undef __FUNCT__ 624 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 625 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 626 { 627 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 628 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 629 MatScalar *v; 630 PetscErrorCode ierr; 631 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap.bs,j,n,bs2=a->bs2,ncols,k; 632 PetscInt nonzerorow=0; 633 634 PetscFunctionBegin; 635 ierr = VecSet(zz,zero);CHKERRQ(ierr); 636 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 637 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 638 639 aj = a->j; 640 v = a->a; 641 ii = a->i; 642 643 if (!a->mult_work) { 644 ierr = PetscMalloc((A->rmap.N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 645 } 646 work = a->mult_work; 647 648 for (i=0; i<mbs; i++) { 649 n = ii[1] - ii[0]; ncols = n*bs; 650 workt = work; idx=aj+ii[0]; 651 nonzerorow += (n>0); 652 653 /* upper triangular part */ 654 for (j=0; j<n; j++) { 655 xb = x_ptr + bs*(*idx++); 656 for (k=0; k<bs; k++) workt[k] = xb[k]; 657 workt += bs; 658 } 659 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 660 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 661 662 /* strict lower triangular part */ 663 idx = aj+ii[0]; 664 if (*idx == i){ 665 ncols -= bs; v += bs2; idx++; n--; 666 } 667 668 if (ncols > 0){ 669 workt = work; 670 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 671 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 672 for (j=0; j<n; j++) { 673 zb = z_ptr + bs*(*idx++); 674 for (k=0; k<bs; k++) zb[k] += workt[k] ; 675 workt += bs; 676 } 677 } 678 x += bs; v += n*bs2; z += bs; ii++; 679 } 680 681 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 682 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 683 ierr = PetscLogFlops(2*(a->nz*2 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 684 PetscFunctionReturn(0); 685 } 686 687 extern PetscErrorCode VecCopy_Seq(Vec,Vec); 688 #undef __FUNCT__ 689 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 690 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 691 { 692 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 693 PetscScalar *x,*z,*xb,x1; 694 MatScalar *v; 695 PetscErrorCode ierr; 696 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 697 PetscInt nonzerorow=0; 698 699 PetscFunctionBegin; 700 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 701 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 702 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 703 v = a->a; 704 xb = x; 705 706 for (i=0; i<mbs; i++) { 707 n = ai[1] - ai[0]; /* length of i_th row of A */ 708 x1 = xb[0]; 709 ib = aj + *ai; 710 jmin = 0; 711 nonzerorow += (n>0); 712 if (*ib == i) { /* (diag of A)*x */ 713 z[i] += *v++ * x[*ib++]; jmin++; 714 } 715 for (j=jmin; j<n; j++) { 716 cval = *ib; 717 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 718 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 719 } 720 xb++; ai++; 721 } 722 723 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 724 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 725 726 ierr = PetscLogFlops(2*(a->nz*2 - nonzerorow));CHKERRQ(ierr); 727 PetscFunctionReturn(0); 728 } 729 730 #undef __FUNCT__ 731 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 732 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 733 { 734 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 735 PetscScalar *x,*z,*xb,x1,x2; 736 MatScalar *v; 737 PetscErrorCode ierr; 738 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 739 PetscInt nonzerorow=0; 740 741 PetscFunctionBegin; 742 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 743 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 744 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 745 746 v = a->a; 747 xb = x; 748 749 for (i=0; i<mbs; i++) { 750 n = ai[1] - ai[0]; /* length of i_th block row of A */ 751 x1 = xb[0]; x2 = xb[1]; 752 ib = aj + *ai; 753 jmin = 0; 754 nonzerorow += (n>0); 755 if (*ib == i){ /* (diag of A)*x */ 756 z[2*i] += v[0]*x1 + v[2]*x2; 757 z[2*i+1] += v[2]*x1 + v[3]*x2; 758 v += 4; jmin++; 759 } 760 for (j=jmin; j<n; j++) { 761 /* (strict lower triangular part of A)*x */ 762 cval = ib[j]*2; 763 z[cval] += v[0]*x1 + v[1]*x2; 764 z[cval+1] += v[2]*x1 + v[3]*x2; 765 /* (strict upper triangular part of A)*x */ 766 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 767 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 768 v += 4; 769 } 770 xb +=2; ai++; 771 } 772 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 773 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 774 775 ierr = PetscLogFlops(4*(a->nz*2 - nonzerorow));CHKERRQ(ierr); 776 PetscFunctionReturn(0); 777 } 778 779 #undef __FUNCT__ 780 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 781 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 782 { 783 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 784 PetscScalar *x,*z,*xb,x1,x2,x3; 785 MatScalar *v; 786 PetscErrorCode ierr; 787 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 788 PetscInt nonzerorow=0; 789 790 PetscFunctionBegin; 791 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 792 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 793 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 794 795 v = a->a; 796 xb = x; 797 798 for (i=0; i<mbs; i++) { 799 n = ai[1] - ai[0]; /* length of i_th block row of A */ 800 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 801 ib = aj + *ai; 802 jmin = 0; 803 nonzerorow += (n>0); 804 if (*ib == i){ /* (diag of A)*x */ 805 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 806 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 807 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 808 v += 9; jmin++; 809 } 810 for (j=jmin; j<n; j++) { 811 /* (strict lower triangular part of A)*x */ 812 cval = ib[j]*3; 813 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 814 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 815 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 816 /* (strict upper triangular part of A)*x */ 817 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 818 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 819 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 820 v += 9; 821 } 822 xb +=3; ai++; 823 } 824 825 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 826 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 827 828 ierr = PetscLogFlops(18*(a->nz*2 - nonzerorow));CHKERRQ(ierr); 829 PetscFunctionReturn(0); 830 } 831 832 #undef __FUNCT__ 833 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 834 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 835 { 836 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 837 PetscScalar *x,*z,*xb,x1,x2,x3,x4; 838 MatScalar *v; 839 PetscErrorCode ierr; 840 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 841 PetscInt nonzerorow=0; 842 843 PetscFunctionBegin; 844 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 845 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 846 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 847 848 v = a->a; 849 xb = x; 850 851 for (i=0; i<mbs; i++) { 852 n = ai[1] - ai[0]; /* length of i_th block row of A */ 853 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 854 ib = aj + *ai; 855 jmin = 0; 856 nonzerorow += (n>0); 857 if (*ib == i){ /* (diag of A)*x */ 858 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 859 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 860 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 861 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 862 v += 16; jmin++; 863 } 864 for (j=jmin; j<n; j++) { 865 /* (strict lower triangular part of A)*x */ 866 cval = ib[j]*4; 867 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 868 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 869 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 870 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 871 /* (strict upper triangular part of A)*x */ 872 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 873 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 874 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 875 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 876 v += 16; 877 } 878 xb +=4; ai++; 879 } 880 881 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 882 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 883 884 ierr = PetscLogFlops(32*(a->nz*2 - nonzerorow));CHKERRQ(ierr); 885 PetscFunctionReturn(0); 886 } 887 888 #undef __FUNCT__ 889 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 890 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 891 { 892 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 893 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5; 894 MatScalar *v; 895 PetscErrorCode ierr; 896 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 897 PetscInt nonzerorow=0; 898 899 PetscFunctionBegin; 900 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 901 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 902 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 903 904 v = a->a; 905 xb = x; 906 907 for (i=0; i<mbs; i++) { 908 n = ai[1] - ai[0]; /* length of i_th block row of A */ 909 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 910 ib = aj + *ai; 911 jmin = 0; 912 nonzerorow += (n>0); 913 if (*ib == i){ /* (diag of A)*x */ 914 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 915 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 916 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 917 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 918 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 919 v += 25; jmin++; 920 } 921 for (j=jmin; j<n; j++) { 922 /* (strict lower triangular part of A)*x */ 923 cval = ib[j]*5; 924 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 925 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 926 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 927 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 928 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 929 /* (strict upper triangular part of A)*x */ 930 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 931 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 932 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 933 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 934 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 935 v += 25; 936 } 937 xb +=5; ai++; 938 } 939 940 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 941 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 942 943 ierr = PetscLogFlops(50*(a->nz*2 - nonzerorow));CHKERRQ(ierr); 944 PetscFunctionReturn(0); 945 } 946 #undef __FUNCT__ 947 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 948 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 949 { 950 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 951 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6; 952 MatScalar *v; 953 PetscErrorCode ierr; 954 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 955 PetscInt nonzerorow=0; 956 957 PetscFunctionBegin; 958 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 959 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 960 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 961 962 v = a->a; 963 xb = x; 964 965 for (i=0; i<mbs; i++) { 966 n = ai[1] - ai[0]; /* length of i_th block row of A */ 967 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 968 ib = aj + *ai; 969 jmin = 0; 970 nonzerorow += (n>0); 971 if (*ib == i){ /* (diag of A)*x */ 972 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 973 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 974 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 975 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 976 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 977 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 978 v += 36; jmin++; 979 } 980 for (j=jmin; j<n; j++) { 981 /* (strict lower triangular part of A)*x */ 982 cval = ib[j]*6; 983 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 984 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 985 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 986 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 987 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 988 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 989 /* (strict upper triangular part of A)*x */ 990 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 991 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 992 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 993 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 994 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 995 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 996 v += 36; 997 } 998 xb +=6; ai++; 999 } 1000 1001 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1002 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1003 1004 ierr = PetscLogFlops(72*(a->nz*2 - nonzerorow));CHKERRQ(ierr); 1005 PetscFunctionReturn(0); 1006 } 1007 1008 #undef __FUNCT__ 1009 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 1010 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1011 { 1012 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1013 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 1014 MatScalar *v; 1015 PetscErrorCode ierr; 1016 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 1017 PetscInt nonzerorow=0; 1018 1019 PetscFunctionBegin; 1020 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 1021 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1022 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1023 1024 v = a->a; 1025 xb = x; 1026 1027 for (i=0; i<mbs; i++) { 1028 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1029 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1030 ib = aj + *ai; 1031 jmin = 0; 1032 nonzerorow += (n>0); 1033 if (*ib == i){ /* (diag of A)*x */ 1034 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1035 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1036 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1037 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1038 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1039 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1040 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1041 v += 49; jmin++; 1042 } 1043 for (j=jmin; j<n; j++) { 1044 /* (strict lower triangular part of A)*x */ 1045 cval = ib[j]*7; 1046 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1047 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1048 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1049 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1050 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1051 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1052 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1053 /* (strict upper triangular part of A)*x */ 1054 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1055 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1056 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1057 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1058 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1059 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1060 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1061 v += 49; 1062 } 1063 xb +=7; ai++; 1064 } 1065 1066 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1067 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1068 1069 ierr = PetscLogFlops(98*(a->nz*2 - nonzerorow));CHKERRQ(ierr); 1070 PetscFunctionReturn(0); 1071 } 1072 1073 #undef __FUNCT__ 1074 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1075 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1076 { 1077 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1078 PetscScalar *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1079 MatScalar *v; 1080 PetscErrorCode ierr; 1081 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap.bs,j,n,bs2=a->bs2,ncols,k; 1082 PetscInt nonzerorow=0; 1083 1084 PetscFunctionBegin; 1085 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 1086 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 1087 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1088 1089 aj = a->j; 1090 v = a->a; 1091 ii = a->i; 1092 1093 if (!a->mult_work) { 1094 ierr = PetscMalloc((A->rmap.n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1095 } 1096 work = a->mult_work; 1097 1098 1099 for (i=0; i<mbs; i++) { 1100 n = ii[1] - ii[0]; ncols = n*bs; 1101 workt = work; idx=aj+ii[0]; 1102 nonzerorow += (n>0); 1103 1104 /* upper triangular part */ 1105 for (j=0; j<n; j++) { 1106 xb = x_ptr + bs*(*idx++); 1107 for (k=0; k<bs; k++) workt[k] = xb[k]; 1108 workt += bs; 1109 } 1110 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1111 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1112 1113 /* strict lower triangular part */ 1114 idx = aj+ii[0]; 1115 if (*idx == i){ 1116 ncols -= bs; v += bs2; idx++; n--; 1117 } 1118 if (ncols > 0){ 1119 workt = work; 1120 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1121 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1122 for (j=0; j<n; j++) { 1123 zb = z_ptr + bs*(*idx++); 1124 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1125 workt += bs; 1126 } 1127 } 1128 1129 x += bs; v += n*bs2; z += bs; ii++; 1130 } 1131 1132 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1133 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1134 1135 ierr = PetscLogFlops(2*(a->nz*2 - nonzerorow));CHKERRQ(ierr); 1136 PetscFunctionReturn(0); 1137 } 1138 1139 #undef __FUNCT__ 1140 #define __FUNCT__ "MatScale_SeqSBAIJ" 1141 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1142 { 1143 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1144 PetscScalar oalpha = alpha; 1145 PetscErrorCode ierr; 1146 PetscBLASInt one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz); 1147 1148 PetscFunctionBegin; 1149 BLASscal_(&totalnz,&oalpha,a->a,&one); 1150 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1151 PetscFunctionReturn(0); 1152 } 1153 1154 #undef __FUNCT__ 1155 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1156 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1157 { 1158 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1159 MatScalar *v = a->a; 1160 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1161 PetscInt i,j,k,bs = A->rmap.bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1162 PetscErrorCode ierr; 1163 PetscInt *jl,*il,jmin,jmax,nexti,ik,*col; 1164 1165 PetscFunctionBegin; 1166 if (type == NORM_FROBENIUS) { 1167 for (k=0; k<mbs; k++){ 1168 jmin = a->i[k]; jmax = a->i[k+1]; 1169 col = aj + jmin; 1170 if (*col == k){ /* diagonal block */ 1171 for (i=0; i<bs2; i++){ 1172 #if defined(PETSC_USE_COMPLEX) 1173 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1174 #else 1175 sum_diag += (*v)*(*v); v++; 1176 #endif 1177 } 1178 jmin++; 1179 } 1180 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1181 for (i=0; i<bs2; i++){ 1182 #if defined(PETSC_USE_COMPLEX) 1183 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1184 #else 1185 sum_off += (*v)*(*v); v++; 1186 #endif 1187 } 1188 } 1189 } 1190 *norm = sqrt(sum_diag + 2*sum_off); 1191 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1192 ierr = PetscMalloc((2*mbs+1)*sizeof(PetscInt)+bs*sizeof(PetscReal),&il);CHKERRQ(ierr); 1193 jl = il + mbs; 1194 sum = (PetscReal*)(jl + mbs); 1195 for (i=0; i<mbs; i++) jl[i] = mbs; 1196 il[0] = 0; 1197 1198 *norm = 0.0; 1199 for (k=0; k<mbs; k++) { /* k_th block row */ 1200 for (j=0; j<bs; j++) sum[j]=0.0; 1201 /*-- col sum --*/ 1202 i = jl[k]; /* first |A(i,k)| to be added */ 1203 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1204 at step k */ 1205 while (i<mbs){ 1206 nexti = jl[i]; /* next block row to be added */ 1207 ik = il[i]; /* block index of A(i,k) in the array a */ 1208 for (j=0; j<bs; j++){ 1209 v = a->a + ik*bs2 + j*bs; 1210 for (k1=0; k1<bs; k1++) { 1211 sum[j] += PetscAbsScalar(*v); v++; 1212 } 1213 } 1214 /* update il, jl */ 1215 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1216 jmax = a->i[i+1]; 1217 if (jmin < jmax){ 1218 il[i] = jmin; 1219 j = a->j[jmin]; 1220 jl[i] = jl[j]; jl[j]=i; 1221 } 1222 i = nexti; 1223 } 1224 /*-- row sum --*/ 1225 jmin = a->i[k]; jmax = a->i[k+1]; 1226 for (i=jmin; i<jmax; i++) { 1227 for (j=0; j<bs; j++){ 1228 v = a->a + i*bs2 + j; 1229 for (k1=0; k1<bs; k1++){ 1230 sum[j] += PetscAbsScalar(*v); v += bs; 1231 } 1232 } 1233 } 1234 /* add k_th block row to il, jl */ 1235 col = aj+jmin; 1236 if (*col == k) jmin++; 1237 if (jmin < jmax){ 1238 il[k] = jmin; 1239 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1240 } 1241 for (j=0; j<bs; j++){ 1242 if (sum[j] > *norm) *norm = sum[j]; 1243 } 1244 } 1245 ierr = PetscFree(il);CHKERRQ(ierr); 1246 } else { 1247 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1248 } 1249 PetscFunctionReturn(0); 1250 } 1251 1252 #undef __FUNCT__ 1253 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1254 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1255 { 1256 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1257 PetscErrorCode ierr; 1258 1259 PetscFunctionBegin; 1260 1261 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1262 if ((A->rmap.N != B->rmap.N) || (A->cmap.n != B->cmap.n) || (A->rmap.bs != B->rmap.bs)|| (a->nz != b->nz)) { 1263 *flg = PETSC_FALSE; 1264 PetscFunctionReturn(0); 1265 } 1266 1267 /* if the a->i are the same */ 1268 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1269 if (!*flg) { 1270 PetscFunctionReturn(0); 1271 } 1272 1273 /* if a->j are the same */ 1274 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1275 if (!*flg) { 1276 PetscFunctionReturn(0); 1277 } 1278 /* if a->a are the same */ 1279 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap.bs)*(A->rmap.bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1280 PetscFunctionReturn(0); 1281 } 1282 1283 #undef __FUNCT__ 1284 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1285 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1286 { 1287 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1288 PetscErrorCode ierr; 1289 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1290 PetscScalar *x,zero = 0.0; 1291 MatScalar *aa,*aa_j; 1292 1293 PetscFunctionBegin; 1294 bs = A->rmap.bs; 1295 if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1296 1297 aa = a->a; 1298 ai = a->i; 1299 aj = a->j; 1300 ambs = a->mbs; 1301 bs2 = a->bs2; 1302 1303 ierr = VecSet(v,zero);CHKERRQ(ierr); 1304 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1305 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1306 if (n != A->rmap.N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1307 for (i=0; i<ambs; i++) { 1308 j=ai[i]; 1309 if (aj[j] == i) { /* if this is a diagonal element */ 1310 row = i*bs; 1311 aa_j = aa + j*bs2; 1312 if (A->factor && bs==1){ 1313 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1314 } else { 1315 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1316 } 1317 } 1318 } 1319 1320 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1321 PetscFunctionReturn(0); 1322 } 1323 1324 #undef __FUNCT__ 1325 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1326 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1327 { 1328 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1329 PetscScalar *l,x,*li,*ri; 1330 MatScalar *aa,*v; 1331 PetscErrorCode ierr; 1332 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1333 PetscTruth flg; 1334 1335 PetscFunctionBegin; 1336 if (ll != rr){ 1337 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1338 if (!flg) 1339 SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1340 } 1341 if (!ll) PetscFunctionReturn(0); 1342 ai = a->i; 1343 aj = a->j; 1344 aa = a->a; 1345 m = A->rmap.N; 1346 bs = A->rmap.bs; 1347 mbs = a->mbs; 1348 bs2 = a->bs2; 1349 1350 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1351 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1352 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1353 for (i=0; i<mbs; i++) { /* for each block row */ 1354 M = ai[i+1] - ai[i]; 1355 li = l + i*bs; 1356 v = aa + bs2*ai[i]; 1357 for (j=0; j<M; j++) { /* for each block */ 1358 ri = l + bs*aj[ai[i]+j]; 1359 for (k=0; k<bs; k++) { 1360 x = ri[k]; 1361 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1362 } 1363 } 1364 } 1365 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1366 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 1367 PetscFunctionReturn(0); 1368 } 1369 1370 #undef __FUNCT__ 1371 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1372 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1373 { 1374 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1375 1376 PetscFunctionBegin; 1377 info->rows_global = (double)A->rmap.N; 1378 info->columns_global = (double)A->rmap.N; 1379 info->rows_local = (double)A->rmap.N; 1380 info->columns_local = (double)A->rmap.N; 1381 info->block_size = a->bs2; 1382 info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */ 1383 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1384 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1385 info->assemblies = A->num_ass; 1386 info->mallocs = a->reallocs; 1387 info->memory = ((PetscObject)A)->mem; 1388 if (A->factor) { 1389 info->fill_ratio_given = A->info.fill_ratio_given; 1390 info->fill_ratio_needed = A->info.fill_ratio_needed; 1391 info->factor_mallocs = A->info.factor_mallocs; 1392 } else { 1393 info->fill_ratio_given = 0; 1394 info->fill_ratio_needed = 0; 1395 info->factor_mallocs = 0; 1396 } 1397 PetscFunctionReturn(0); 1398 } 1399 1400 1401 #undef __FUNCT__ 1402 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1403 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1404 { 1405 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1406 PetscErrorCode ierr; 1407 1408 PetscFunctionBegin; 1409 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1410 PetscFunctionReturn(0); 1411 } 1412 1413 #undef __FUNCT__ 1414 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ" 1415 /* 1416 This code does not work since it only checks the upper triangular part of 1417 the matrix. Hence it is not listed in the function table. 1418 */ 1419 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1420 { 1421 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1422 PetscErrorCode ierr; 1423 PetscInt i,j,n,row,col,bs,*ai,*aj,mbs; 1424 PetscReal atmp; 1425 MatScalar *aa; 1426 PetscScalar *x; 1427 PetscInt ncols,brow,bcol,krow,kcol; 1428 1429 PetscFunctionBegin; 1430 if (idx) SETERRQ(PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1431 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1432 bs = A->rmap.bs; 1433 aa = a->a; 1434 ai = a->i; 1435 aj = a->j; 1436 mbs = a->mbs; 1437 1438 ierr = VecSet(v,0.0);CHKERRQ(ierr); 1439 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1440 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1441 if (n != A->rmap.N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1442 for (i=0; i<mbs; i++) { 1443 ncols = ai[1] - ai[0]; ai++; 1444 brow = bs*i; 1445 for (j=0; j<ncols; j++){ 1446 bcol = bs*(*aj); 1447 for (kcol=0; kcol<bs; kcol++){ 1448 col = bcol + kcol; /* col index */ 1449 for (krow=0; krow<bs; krow++){ 1450 atmp = PetscAbsScalar(*aa); aa++; 1451 row = brow + krow; /* row index */ 1452 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1453 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1454 } 1455 } 1456 aj++; 1457 } 1458 } 1459 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1460 PetscFunctionReturn(0); 1461 } 1462