1 2 #include "src/mat/impls/baij/seq/baij.h" 3 #include "src/inline/spops.h" 4 #include "src/inline/ilu.h" 5 #include "petscbt.h" 6 #include "src/mat/impls/sbaij/seq/sbaij.h" 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS is[],int ov) 11 { 12 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 13 PetscErrorCode ierr; 14 int brow,i,j,k,l,mbs,n,*idx,*nidx,isz,bcol,bcol_max, 15 start,end,*ai,*aj,bs,*nidx2; 16 PetscBT table; 17 PetscBT table0; 18 19 PetscFunctionBegin; 20 mbs = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = a->bs; 24 25 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 26 27 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 28 ierr = PetscMalloc((mbs+1)*sizeof(int),&nidx);CHKERRQ(ierr); 29 ierr = PetscMalloc((A->m+1)*sizeof(int),&nidx2);CHKERRQ(ierr); 30 ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr); 31 32 for (i=0; i<is_max; i++) { /* for each is */ 33 isz = 0; 34 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 35 36 /* Extract the indices, assume there can be duplicate entries */ 37 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 38 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 39 40 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 41 bcol_max = 0; 42 for (j=0; j<n ; ++j){ 43 brow = idx[j]/bs; /* convert the indices into block indices */ 44 if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 45 if(!PetscBTLookupSet(table,brow)) { 46 nidx[isz++] = brow; 47 if (bcol_max < brow) bcol_max = brow; 48 } 49 } 50 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 51 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 52 53 k = 0; 54 for (j=0; j<ov; j++){ /* for each overlap */ 55 /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 56 ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr); 57 for (l=k; l<isz; l++) PetscBTSet(table0,nidx[l]); 58 59 n = isz; /* length of the updated is[i] */ 60 for (brow=0; brow<mbs; brow++){ 61 start = ai[brow]; end = ai[brow+1]; 62 if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */ 63 for (l = start; l<end ; l++){ 64 bcol = aj[l]; 65 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 66 } 67 k++; 68 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 69 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 70 for (l = start; l<end ; l++){ 71 bcol = aj[l]; 72 if (bcol > bcol_max) break; 73 if (PetscBTLookup(table0,bcol)){ 74 if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;} 75 break; /* for l = start; l<end ; l++) */ 76 } 77 } 78 } 79 } 80 } /* for each overlap */ 81 82 /* expand the Index Set */ 83 for (j=0; j<isz; j++) { 84 for (k=0; k<bs; k++) 85 nidx2[j*bs+k] = nidx[j]*bs+k; 86 } 87 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 88 } 89 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 90 ierr = PetscFree(nidx);CHKERRQ(ierr); 91 ierr = PetscFree(nidx2);CHKERRQ(ierr); 92 ierr = PetscBTDestroy(table0);CHKERRQ(ierr); 93 PetscFunctionReturn(0); 94 } 95 96 #undef __FUNCT__ 97 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 98 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B) 99 { 100 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 101 PetscErrorCode ierr; 102 int *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 103 int row,mat_i,*mat_j,tcol,*mat_ilen; 104 int *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2; 105 int *aj = a->j,*ai = a->i; 106 MatScalar *mat_a; 107 Mat C; 108 PetscTruth flag; 109 110 PetscFunctionBegin; 111 112 if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro"); 113 ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr); 114 if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 115 116 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 117 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 118 119 ierr = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr); 120 ssmap = smap; 121 ierr = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr); 122 ierr = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr); 123 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 124 /* determine lens of each row */ 125 for (i=0; i<nrows; i++) { 126 kstart = ai[irow[i]]; 127 kend = kstart + a->ilen[irow[i]]; 128 lens[i] = 0; 129 for (k=kstart; k<kend; k++) { 130 if (ssmap[aj[k]]) { 131 lens[i]++; 132 } 133 } 134 } 135 /* Create and fill new matrix */ 136 if (scall == MAT_REUSE_MATRIX) { 137 c = (Mat_SeqSBAIJ *)((*B)->data); 138 139 if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 140 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr); 141 if (flag == PETSC_FALSE) { 142 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 143 } 144 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr); 145 C = *B; 146 } else { 147 ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr); 148 ierr = MatSetType(C,A->type_name);CHKERRQ(ierr); 149 ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 150 } 151 c = (Mat_SeqSBAIJ *)(C->data); 152 for (i=0; i<nrows; i++) { 153 row = irow[i]; 154 kstart = ai[row]; 155 kend = kstart + a->ilen[row]; 156 mat_i = c->i[i]; 157 mat_j = c->j + mat_i; 158 mat_a = c->a + mat_i*bs2; 159 mat_ilen = c->ilen + i; 160 for (k=kstart; k<kend; k++) { 161 if ((tcol=ssmap[a->j[k]])) { 162 *mat_j++ = tcol - 1; 163 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 164 mat_a += bs2; 165 (*mat_ilen)++; 166 } 167 } 168 } 169 170 /* Free work space */ 171 ierr = PetscFree(smap);CHKERRQ(ierr); 172 ierr = PetscFree(lens);CHKERRQ(ierr); 173 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 174 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 175 176 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 177 *B = C; 178 PetscFunctionReturn(0); 179 } 180 181 #undef __FUNCT__ 182 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 183 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B) 184 { 185 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 186 IS is1; 187 PetscErrorCode ierr; 188 int *vary,*iary,*irow,nrows,i,bs=a->bs,count; 189 190 PetscFunctionBegin; 191 if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro"); 192 193 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 194 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 195 196 /* Verify if the indices corespond to each element in a block 197 and form the IS with compressed IS */ 198 ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr); 199 iary = vary + a->mbs; 200 ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr); 201 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 202 203 count = 0; 204 for (i=0; i<a->mbs; i++) { 205 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks"); 206 if (vary[i]==bs) iary[count++] = i; 207 } 208 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 209 210 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 211 ierr = PetscFree(vary);CHKERRQ(ierr); 212 213 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr); 214 ISDestroy(is1); 215 PetscFunctionReturn(0); 216 } 217 218 #undef __FUNCT__ 219 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 220 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 221 { 222 PetscErrorCode ierr; 223 int i; 224 225 PetscFunctionBegin; 226 if (scall == MAT_INITIAL_MATRIX) { 227 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 228 } 229 230 for (i=0; i<n; i++) { 231 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr); 232 } 233 PetscFunctionReturn(0); 234 } 235 236 /* -------------------------------------------------------*/ 237 /* Should check that shapes of vectors and matrices match */ 238 /* -------------------------------------------------------*/ 239 #include "petscblaslapack.h" 240 241 #undef __FUNCT__ 242 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 243 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 244 { 245 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 246 PetscScalar *x,*z,*xb,x1,zero=0.0; 247 MatScalar *v; 248 PetscErrorCode ierr; 249 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 250 251 PetscFunctionBegin; 252 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 253 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 254 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 255 256 v = a->a; 257 xb = x; 258 259 for (i=0; i<mbs; i++) { 260 n = ai[1] - ai[0]; /* length of i_th row of A */ 261 x1 = xb[0]; 262 ib = aj + *ai; 263 jmin = 0; 264 if (*ib == i) { /* (diag of A)*x */ 265 z[i] += *v++ * x[*ib++]; 266 jmin++; 267 } 268 for (j=jmin; j<n; j++) { 269 cval = *ib; 270 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 271 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 272 } 273 xb++; ai++; 274 } 275 276 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 277 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 278 PetscLogFlops(2*(a->nz*2 - A->m) - A->m); /* nz = (nz+m)/2 */ 279 PetscFunctionReturn(0); 280 } 281 282 #undef __FUNCT__ 283 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 284 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 285 { 286 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 287 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 288 MatScalar *v; 289 PetscErrorCode ierr; 290 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 291 292 293 PetscFunctionBegin; 294 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 295 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 296 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 297 298 v = a->a; 299 xb = x; 300 301 for (i=0; i<mbs; i++) { 302 n = ai[1] - ai[0]; /* length of i_th block row of A */ 303 x1 = xb[0]; x2 = xb[1]; 304 ib = aj + *ai; 305 jmin = 0; 306 if (*ib == i){ /* (diag of A)*x */ 307 z[2*i] += v[0]*x1 + v[2]*x2; 308 z[2*i+1] += v[2]*x1 + v[3]*x2; 309 v += 4; jmin++; 310 } 311 for (j=jmin; j<n; j++) { 312 /* (strict lower triangular part of A)*x */ 313 cval = ib[j]*2; 314 z[cval] += v[0]*x1 + v[1]*x2; 315 z[cval+1] += v[2]*x1 + v[3]*x2; 316 /* (strict upper triangular part of A)*x */ 317 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 318 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 319 v += 4; 320 } 321 xb +=2; ai++; 322 } 323 324 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 325 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 326 PetscLogFlops(8*(a->nz*2 - A->m) - A->m); 327 PetscFunctionReturn(0); 328 } 329 330 #undef __FUNCT__ 331 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 332 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 333 { 334 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 335 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 336 MatScalar *v; 337 PetscErrorCode ierr; 338 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 339 340 341 PetscFunctionBegin; 342 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 343 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 344 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 345 346 v = a->a; 347 xb = x; 348 349 for (i=0; i<mbs; i++) { 350 n = ai[1] - ai[0]; /* length of i_th block row of A */ 351 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 352 ib = aj + *ai; 353 jmin = 0; 354 if (*ib == i){ /* (diag of A)*x */ 355 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 356 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 357 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 358 v += 9; jmin++; 359 } 360 for (j=jmin; j<n; j++) { 361 /* (strict lower triangular part of A)*x */ 362 cval = ib[j]*3; 363 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 364 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 365 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 366 /* (strict upper triangular part of A)*x */ 367 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 368 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 369 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 370 v += 9; 371 } 372 xb +=3; ai++; 373 } 374 375 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 376 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 377 PetscLogFlops(18*(a->nz*2 - A->m) - A->m); 378 PetscFunctionReturn(0); 379 } 380 381 #undef __FUNCT__ 382 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 383 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 384 { 385 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 386 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 387 MatScalar *v; 388 PetscErrorCode ierr; 389 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 390 391 PetscFunctionBegin; 392 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 393 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 394 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 395 396 v = a->a; 397 xb = x; 398 399 for (i=0; i<mbs; i++) { 400 n = ai[1] - ai[0]; /* length of i_th block row of A */ 401 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 402 ib = aj + *ai; 403 jmin = 0; 404 if (*ib == i){ /* (diag of A)*x */ 405 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 406 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 407 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 408 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 409 v += 16; jmin++; 410 } 411 for (j=jmin; j<n; j++) { 412 /* (strict lower triangular part of A)*x */ 413 cval = ib[j]*4; 414 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 415 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 416 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 417 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 418 /* (strict upper triangular part of A)*x */ 419 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 420 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 421 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 422 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 423 v += 16; 424 } 425 xb +=4; ai++; 426 } 427 428 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 429 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 430 PetscLogFlops(32*(a->nz*2 - A->m) - A->m); 431 PetscFunctionReturn(0); 432 } 433 434 #undef __FUNCT__ 435 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 436 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 437 { 438 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 439 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 440 MatScalar *v; 441 PetscErrorCode ierr; 442 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 443 444 PetscFunctionBegin; 445 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 446 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 447 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 448 449 v = a->a; 450 xb = x; 451 452 for (i=0; i<mbs; i++) { 453 n = ai[1] - ai[0]; /* length of i_th block row of A */ 454 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 455 ib = aj + *ai; 456 jmin = 0; 457 if (*ib == i){ /* (diag of A)*x */ 458 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 459 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 460 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 461 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 462 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 463 v += 25; jmin++; 464 } 465 for (j=jmin; j<n; j++) { 466 /* (strict lower triangular part of A)*x */ 467 cval = ib[j]*5; 468 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 469 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 470 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 471 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 472 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 473 /* (strict upper triangular part of A)*x */ 474 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 475 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 476 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 477 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 478 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 479 v += 25; 480 } 481 xb +=5; ai++; 482 } 483 484 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 485 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 486 PetscLogFlops(50*(a->nz*2 - A->m) - A->m); 487 PetscFunctionReturn(0); 488 } 489 490 491 #undef __FUNCT__ 492 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 493 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 494 { 495 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 496 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 497 MatScalar *v; 498 PetscErrorCode ierr; 499 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 500 501 PetscFunctionBegin; 502 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 503 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 504 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 505 506 v = a->a; 507 xb = x; 508 509 for (i=0; i<mbs; i++) { 510 n = ai[1] - ai[0]; /* length of i_th block row of A */ 511 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 512 ib = aj + *ai; 513 jmin = 0; 514 if (*ib == i){ /* (diag of A)*x */ 515 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 516 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 517 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 518 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 519 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 520 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 521 v += 36; jmin++; 522 } 523 for (j=jmin; j<n; j++) { 524 /* (strict lower triangular part of A)*x */ 525 cval = ib[j]*6; 526 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 527 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 528 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 529 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 530 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 531 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 532 /* (strict upper triangular part of A)*x */ 533 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 534 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 535 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 536 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 537 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 538 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 539 v += 36; 540 } 541 xb +=6; ai++; 542 } 543 544 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 545 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 546 PetscLogFlops(72*(a->nz*2 - A->m) - A->m); 547 PetscFunctionReturn(0); 548 } 549 #undef __FUNCT__ 550 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 551 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 552 { 553 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 554 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 555 MatScalar *v; 556 PetscErrorCode ierr; 557 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 558 559 PetscFunctionBegin; 560 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 561 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 562 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 563 564 v = a->a; 565 xb = x; 566 567 for (i=0; i<mbs; i++) { 568 n = ai[1] - ai[0]; /* length of i_th block row of A */ 569 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 570 ib = aj + *ai; 571 jmin = 0; 572 if (*ib == i){ /* (diag of A)*x */ 573 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 574 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 575 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 576 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 577 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 578 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 579 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 580 v += 49; jmin++; 581 } 582 for (j=jmin; j<n; j++) { 583 /* (strict lower triangular part of A)*x */ 584 cval = ib[j]*7; 585 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 586 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 587 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 588 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 589 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 590 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 591 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 592 /* (strict upper triangular part of A)*x */ 593 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 594 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 595 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 596 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 597 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 598 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 599 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 600 v += 49; 601 } 602 xb +=7; ai++; 603 } 604 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 605 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 606 PetscLogFlops(98*(a->nz*2 - A->m) - A->m); 607 PetscFunctionReturn(0); 608 } 609 610 /* 611 This will not work with MatScalar == float because it calls the BLAS 612 */ 613 #undef __FUNCT__ 614 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 615 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 616 { 617 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 618 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 619 MatScalar *v; 620 PetscErrorCode ierr; 621 int mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2; 622 int ncols,k; 623 624 PetscFunctionBegin; 625 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 626 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 627 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 628 629 aj = a->j; 630 v = a->a; 631 ii = a->i; 632 633 if (!a->mult_work) { 634 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 635 } 636 work = a->mult_work; 637 638 for (i=0; i<mbs; i++) { 639 n = ii[1] - ii[0]; ncols = n*bs; 640 workt = work; idx=aj+ii[0]; 641 642 /* upper triangular part */ 643 for (j=0; j<n; j++) { 644 xb = x_ptr + bs*(*idx++); 645 for (k=0; k<bs; k++) workt[k] = xb[k]; 646 workt += bs; 647 } 648 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 649 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 650 651 /* strict lower triangular part */ 652 idx = aj+ii[0]; 653 if (*idx == i){ 654 ncols -= bs; v += bs2; idx++; n--; 655 } 656 657 if (ncols > 0){ 658 workt = work; 659 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 660 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 661 for (j=0; j<n; j++) { 662 zb = z_ptr + bs*(*idx++); 663 for (k=0; k<bs; k++) zb[k] += workt[k] ; 664 workt += bs; 665 } 666 } 667 x += bs; v += n*bs2; z += bs; ii++; 668 } 669 670 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 671 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 672 PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m); 673 PetscFunctionReturn(0); 674 } 675 676 #undef __FUNCT__ 677 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 678 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 679 { 680 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 681 PetscScalar *x,*y,*z,*xb,x1; 682 MatScalar *v; 683 PetscErrorCode ierr; 684 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 685 686 PetscFunctionBegin; 687 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 688 if (yy != xx) { 689 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 690 } else { 691 y = x; 692 } 693 if (zz != yy) { 694 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 695 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 696 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 697 } else { 698 z = y; 699 } 700 701 v = a->a; 702 xb = x; 703 704 for (i=0; i<mbs; i++) { 705 n = ai[1] - ai[0]; /* length of i_th row of A */ 706 x1 = xb[0]; 707 ib = aj + *ai; 708 jmin = 0; 709 if (*ib == i) { /* (diag of A)*x */ 710 z[i] += *v++ * x[*ib++]; jmin++; 711 } 712 for (j=jmin; j<n; j++) { 713 cval = *ib; 714 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 715 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 716 } 717 xb++; ai++; 718 } 719 720 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 721 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 722 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 723 724 PetscLogFlops(2*(a->nz*2 - A->m)); 725 PetscFunctionReturn(0); 726 } 727 728 #undef __FUNCT__ 729 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 730 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 731 { 732 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 733 PetscScalar *x,*y,*z,*xb,x1,x2; 734 MatScalar *v; 735 PetscErrorCode ierr; 736 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 737 738 PetscFunctionBegin; 739 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 740 if (yy != xx) { 741 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 742 } else { 743 y = x; 744 } 745 if (zz != yy) { 746 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 747 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 748 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 749 } else { 750 z = y; 751 } 752 753 v = a->a; 754 xb = x; 755 756 for (i=0; i<mbs; i++) { 757 n = ai[1] - ai[0]; /* length of i_th block row of A */ 758 x1 = xb[0]; x2 = xb[1]; 759 ib = aj + *ai; 760 jmin = 0; 761 if (*ib == i){ /* (diag of A)*x */ 762 z[2*i] += v[0]*x1 + v[2]*x2; 763 z[2*i+1] += v[2]*x1 + v[3]*x2; 764 v += 4; jmin++; 765 } 766 for (j=jmin; j<n; j++) { 767 /* (strict lower triangular part of A)*x */ 768 cval = ib[j]*2; 769 z[cval] += v[0]*x1 + v[1]*x2; 770 z[cval+1] += v[2]*x1 + v[3]*x2; 771 /* (strict upper triangular part of A)*x */ 772 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 773 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 774 v += 4; 775 } 776 xb +=2; ai++; 777 } 778 779 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 780 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 781 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 782 783 PetscLogFlops(4*(a->nz*2 - A->m)); 784 PetscFunctionReturn(0); 785 } 786 787 #undef __FUNCT__ 788 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 789 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 790 { 791 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 792 PetscScalar *x,*y,*z,*xb,x1,x2,x3; 793 MatScalar *v; 794 PetscErrorCode ierr; 795 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 796 797 PetscFunctionBegin; 798 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 799 if (yy != xx) { 800 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 801 } else { 802 y = x; 803 } 804 if (zz != yy) { 805 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 806 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 807 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 808 } else { 809 z = y; 810 } 811 812 v = a->a; 813 xb = x; 814 815 for (i=0; i<mbs; i++) { 816 n = ai[1] - ai[0]; /* length of i_th block row of A */ 817 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 818 ib = aj + *ai; 819 jmin = 0; 820 if (*ib == i){ /* (diag of A)*x */ 821 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 822 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 823 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 824 v += 9; jmin++; 825 } 826 for (j=jmin; j<n; j++) { 827 /* (strict lower triangular part of A)*x */ 828 cval = ib[j]*3; 829 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 830 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 831 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 832 /* (strict upper triangular part of A)*x */ 833 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 834 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 835 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 836 v += 9; 837 } 838 xb +=3; ai++; 839 } 840 841 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 842 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 843 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 844 845 PetscLogFlops(18*(a->nz*2 - A->m)); 846 PetscFunctionReturn(0); 847 } 848 849 #undef __FUNCT__ 850 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 851 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 852 { 853 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 854 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4; 855 MatScalar *v; 856 PetscErrorCode ierr; 857 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 858 859 PetscFunctionBegin; 860 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 861 if (yy != xx) { 862 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 863 } else { 864 y = x; 865 } 866 if (zz != yy) { 867 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 868 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 869 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 870 } else { 871 z = y; 872 } 873 874 v = a->a; 875 xb = x; 876 877 for (i=0; i<mbs; i++) { 878 n = ai[1] - ai[0]; /* length of i_th block row of A */ 879 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 880 ib = aj + *ai; 881 jmin = 0; 882 if (*ib == i){ /* (diag of A)*x */ 883 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 884 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 885 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 886 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 887 v += 16; jmin++; 888 } 889 for (j=jmin; j<n; j++) { 890 /* (strict lower triangular part of A)*x */ 891 cval = ib[j]*4; 892 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 893 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 894 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 895 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 896 /* (strict upper triangular part of A)*x */ 897 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 898 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 899 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 900 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 901 v += 16; 902 } 903 xb +=4; ai++; 904 } 905 906 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 907 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 908 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 909 910 PetscLogFlops(32*(a->nz*2 - A->m)); 911 PetscFunctionReturn(0); 912 } 913 914 #undef __FUNCT__ 915 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 916 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 917 { 918 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 919 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5; 920 MatScalar *v; 921 PetscErrorCode ierr; 922 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 923 924 PetscFunctionBegin; 925 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 926 if (yy != xx) { 927 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 928 } else { 929 y = x; 930 } 931 if (zz != yy) { 932 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 933 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 934 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 935 } else { 936 z = y; 937 } 938 939 v = a->a; 940 xb = x; 941 942 for (i=0; i<mbs; i++) { 943 n = ai[1] - ai[0]; /* length of i_th block row of A */ 944 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 945 ib = aj + *ai; 946 jmin = 0; 947 if (*ib == i){ /* (diag of A)*x */ 948 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 949 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 950 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 951 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 952 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 953 v += 25; jmin++; 954 } 955 for (j=jmin; j<n; j++) { 956 /* (strict lower triangular part of A)*x */ 957 cval = ib[j]*5; 958 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 959 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 960 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 961 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 962 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 963 /* (strict upper triangular part of A)*x */ 964 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 965 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 966 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 967 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 968 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 969 v += 25; 970 } 971 xb +=5; ai++; 972 } 973 974 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 975 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 976 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 977 978 PetscLogFlops(50*(a->nz*2 - A->m)); 979 PetscFunctionReturn(0); 980 } 981 #undef __FUNCT__ 982 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 983 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 984 { 985 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 986 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6; 987 MatScalar *v; 988 PetscErrorCode ierr; 989 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 990 991 PetscFunctionBegin; 992 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 993 if (yy != xx) { 994 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 995 } else { 996 y = x; 997 } 998 if (zz != yy) { 999 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 1000 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1001 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1002 } else { 1003 z = y; 1004 } 1005 1006 v = a->a; 1007 xb = x; 1008 1009 for (i=0; i<mbs; i++) { 1010 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1011 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 1012 ib = aj + *ai; 1013 jmin = 0; 1014 if (*ib == i){ /* (diag of A)*x */ 1015 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 1016 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 1017 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 1018 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 1019 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 1020 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1021 v += 36; jmin++; 1022 } 1023 for (j=jmin; j<n; j++) { 1024 /* (strict lower triangular part of A)*x */ 1025 cval = ib[j]*6; 1026 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 1027 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 1028 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 1029 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 1030 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 1031 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1032 /* (strict upper triangular part of A)*x */ 1033 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 1034 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 1035 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 1036 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 1037 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 1038 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 1039 v += 36; 1040 } 1041 xb +=6; ai++; 1042 } 1043 1044 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1045 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1046 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1047 1048 PetscLogFlops(72*(a->nz*2 - A->m)); 1049 PetscFunctionReturn(0); 1050 } 1051 1052 #undef __FUNCT__ 1053 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 1054 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1055 { 1056 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1057 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 1058 MatScalar *v; 1059 PetscErrorCode ierr; 1060 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 1061 1062 PetscFunctionBegin; 1063 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1064 if (yy != xx) { 1065 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1066 } else { 1067 y = x; 1068 } 1069 if (zz != yy) { 1070 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 1071 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1072 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1073 } else { 1074 z = y; 1075 } 1076 1077 v = a->a; 1078 xb = x; 1079 1080 for (i=0; i<mbs; i++) { 1081 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1082 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1083 ib = aj + *ai; 1084 jmin = 0; 1085 if (*ib == i){ /* (diag of A)*x */ 1086 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1087 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1088 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1089 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1090 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1091 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1092 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1093 v += 49; jmin++; 1094 } 1095 for (j=jmin; j<n; j++) { 1096 /* (strict lower triangular part of A)*x */ 1097 cval = ib[j]*7; 1098 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1099 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1100 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1101 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1102 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1103 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1104 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1105 /* (strict upper triangular part of A)*x */ 1106 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1107 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1108 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1109 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1110 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1111 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1112 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1113 v += 49; 1114 } 1115 xb +=7; ai++; 1116 } 1117 1118 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1119 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1120 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1121 1122 PetscLogFlops(98*(a->nz*2 - A->m)); 1123 PetscFunctionReturn(0); 1124 } 1125 1126 #undef __FUNCT__ 1127 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1128 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1129 { 1130 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1131 PetscScalar *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1132 MatScalar *v; 1133 PetscErrorCode ierr; 1134 int mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2; 1135 int ncols,k; 1136 1137 PetscFunctionBegin; 1138 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 1139 if (yy != xx) { 1140 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1141 } else { 1142 y = x; 1143 } 1144 if (zz != yy) { 1145 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 1146 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1147 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1148 } else { 1149 z = y; 1150 } 1151 1152 aj = a->j; 1153 v = a->a; 1154 ii = a->i; 1155 1156 if (!a->mult_work) { 1157 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1158 } 1159 work = a->mult_work; 1160 1161 1162 for (i=0; i<mbs; i++) { 1163 n = ii[1] - ii[0]; ncols = n*bs; 1164 workt = work; idx=aj+ii[0]; 1165 1166 /* upper triangular part */ 1167 for (j=0; j<n; j++) { 1168 xb = x_ptr + bs*(*idx++); 1169 for (k=0; k<bs; k++) workt[k] = xb[k]; 1170 workt += bs; 1171 } 1172 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1173 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1174 1175 /* strict lower triangular part */ 1176 idx = aj+ii[0]; 1177 if (*idx == i){ 1178 ncols -= bs; v += bs2; idx++; n--; 1179 } 1180 if (ncols > 0){ 1181 workt = work; 1182 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1183 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1184 for (j=0; j<n; j++) { 1185 zb = z_ptr + bs*(*idx++); 1186 /* idx++; */ 1187 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1188 workt += bs; 1189 } 1190 } 1191 1192 x += bs; v += n*bs2; z += bs; ii++; 1193 } 1194 1195 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1196 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1197 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1198 1199 PetscLogFlops(2*(a->nz*2 - A->m)); 1200 PetscFunctionReturn(0); 1201 } 1202 1203 #undef __FUNCT__ 1204 #define __FUNCT__ "MatMultTranspose_SeqSBAIJ" 1205 PetscErrorCode MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz) 1206 { 1207 PetscErrorCode ierr; 1208 1209 PetscFunctionBegin; 1210 ierr = MatMult(A,xx,zz);CHKERRQ(ierr); 1211 PetscFunctionReturn(0); 1212 } 1213 1214 #undef __FUNCT__ 1215 #define __FUNCT__ "MatMultTransposeAdd_SeqSBAIJ" 1216 PetscErrorCode MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1217 1218 { 1219 PetscErrorCode ierr; 1220 1221 PetscFunctionBegin; 1222 ierr = MatMultAdd(A,xx,yy,zz);CHKERRQ(ierr); 1223 PetscFunctionReturn(0); 1224 } 1225 1226 #undef __FUNCT__ 1227 #define __FUNCT__ "MatScale_SeqSBAIJ" 1228 PetscErrorCode MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA) 1229 { 1230 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1231 int one = 1,totalnz = a->bs2*a->nz; 1232 1233 PetscFunctionBegin; 1234 BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one); 1235 PetscLogFlops(totalnz); 1236 PetscFunctionReturn(0); 1237 } 1238 1239 #undef __FUNCT__ 1240 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1241 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1242 { 1243 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1244 MatScalar *v = a->a; 1245 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1246 int i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1247 PetscErrorCode ierr; 1248 int *jl,*il,jmin,jmax,nexti,ik,*col; 1249 1250 PetscFunctionBegin; 1251 if (type == NORM_FROBENIUS) { 1252 for (k=0; k<mbs; k++){ 1253 jmin = a->i[k]; jmax = a->i[k+1]; 1254 col = aj + jmin; 1255 if (*col == k){ /* diagonal block */ 1256 for (i=0; i<bs2; i++){ 1257 #if defined(PETSC_USE_COMPLEX) 1258 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1259 #else 1260 sum_diag += (*v)*(*v); v++; 1261 #endif 1262 } 1263 jmin++; 1264 } 1265 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1266 for (i=0; i<bs2; i++){ 1267 #if defined(PETSC_USE_COMPLEX) 1268 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1269 #else 1270 sum_off += (*v)*(*v); v++; 1271 #endif 1272 } 1273 } 1274 } 1275 *norm = sqrt(sum_diag + 2*sum_off); 1276 1277 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1278 ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr); 1279 ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr); 1280 ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr); 1281 for (i=0; i<mbs; i++) { 1282 jl[i] = mbs; il[0] = 0; 1283 } 1284 1285 *norm = 0.0; 1286 for (k=0; k<mbs; k++) { /* k_th block row */ 1287 for (j=0; j<bs; j++) sum[j]=0.0; 1288 1289 /*-- col sum --*/ 1290 i = jl[k]; /* first |A(i,k)| to be added */ 1291 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1292 at step k */ 1293 while (i<mbs){ 1294 nexti = jl[i]; /* next block row to be added */ 1295 ik = il[i]; /* block index of A(i,k) in the array a */ 1296 for (j=0; j<bs; j++){ 1297 v = a->a + ik*bs2 + j*bs; 1298 for (k1=0; k1<bs; k1++) { 1299 sum[j] += PetscAbsScalar(*v); v++; 1300 } 1301 } 1302 /* update il, jl */ 1303 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1304 jmax = a->i[i+1]; 1305 if (jmin < jmax){ 1306 il[i] = jmin; 1307 j = a->j[jmin]; 1308 jl[i] = jl[j]; jl[j]=i; 1309 } 1310 i = nexti; 1311 } 1312 1313 /*-- row sum --*/ 1314 jmin = a->i[k]; jmax = a->i[k+1]; 1315 for (i=jmin; i<jmax; i++) { 1316 for (j=0; j<bs; j++){ 1317 v = a->a + i*bs2 + j; 1318 for (k1=0; k1<bs; k1++){ 1319 sum[j] += PetscAbsScalar(*v); 1320 v += bs; 1321 } 1322 } 1323 } 1324 /* add k_th block row to il, jl */ 1325 col = aj+jmin; 1326 if (*col == k) jmin++; 1327 if (jmin < jmax){ 1328 il[k] = jmin; 1329 j = a->j[jmin]; 1330 jl[k] = jl[j]; jl[j] = k; 1331 } 1332 for (j=0; j<bs; j++){ 1333 if (sum[j] > *norm) *norm = sum[j]; 1334 } 1335 } 1336 ierr = PetscFree(il);CHKERRQ(ierr); 1337 ierr = PetscFree(jl);CHKERRQ(ierr); 1338 ierr = PetscFree(sum);CHKERRQ(ierr); 1339 } else { 1340 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1341 } 1342 PetscFunctionReturn(0); 1343 } 1344 1345 #undef __FUNCT__ 1346 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1347 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1348 { 1349 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1350 PetscErrorCode ierr; 1351 1352 PetscFunctionBegin; 1353 1354 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1355 if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) { 1356 *flg = PETSC_FALSE; 1357 PetscFunctionReturn(0); 1358 } 1359 1360 /* if the a->i are the same */ 1361 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr); 1362 if (*flg == PETSC_FALSE) { 1363 PetscFunctionReturn(0); 1364 } 1365 1366 /* if a->j are the same */ 1367 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);CHKERRQ(ierr); 1368 if (*flg == PETSC_FALSE) { 1369 PetscFunctionReturn(0); 1370 } 1371 /* if a->a are the same */ 1372 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1373 1374 PetscFunctionReturn(0); 1375 } 1376 1377 #undef __FUNCT__ 1378 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1379 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1380 { 1381 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1382 PetscErrorCode ierr; 1383 int i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1384 PetscScalar *x,zero = 0.0; 1385 MatScalar *aa,*aa_j; 1386 1387 PetscFunctionBegin; 1388 bs = a->bs; 1389 if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1390 1391 aa = a->a; 1392 ai = a->i; 1393 aj = a->j; 1394 ambs = a->mbs; 1395 bs2 = a->bs2; 1396 1397 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1398 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1399 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1400 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1401 for (i=0; i<ambs; i++) { 1402 j=ai[i]; 1403 if (aj[j] == i) { /* if this is a diagonal element */ 1404 row = i*bs; 1405 aa_j = aa + j*bs2; 1406 if (A->factor && bs==1){ 1407 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1408 } else { 1409 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1410 } 1411 } 1412 } 1413 1414 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1415 PetscFunctionReturn(0); 1416 } 1417 1418 #undef __FUNCT__ 1419 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1420 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1421 { 1422 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1423 PetscScalar *l,*r,x,*li,*ri; 1424 MatScalar *aa,*v; 1425 PetscErrorCode ierr; 1426 int i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1427 1428 PetscFunctionBegin; 1429 ai = a->i; 1430 aj = a->j; 1431 aa = a->a; 1432 m = A->m; 1433 bs = a->bs; 1434 mbs = a->mbs; 1435 bs2 = a->bs2; 1436 1437 if (ll != rr) { 1438 SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1439 } 1440 if (ll) { 1441 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1442 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1443 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1444 for (i=0; i<mbs; i++) { /* for each block row */ 1445 M = ai[i+1] - ai[i]; 1446 li = l + i*bs; 1447 v = aa + bs2*ai[i]; 1448 for (j=0; j<M; j++) { /* for each block */ 1449 for (k=0; k<bs2; k++) { 1450 (*v++) *= li[k%bs]; 1451 } 1452 #ifdef CONT 1453 /* will be used to replace the above loop */ 1454 ri = l + bs*aj[ai[i]+j]; 1455 for (k=0; k<bs; k++) { /* column value */ 1456 x = ri[k]; 1457 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1458 } 1459 #endif 1460 1461 } 1462 } 1463 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1464 PetscLogFlops(2*a->nz); 1465 } 1466 /* will be deleted */ 1467 if (rr) { 1468 ierr = VecGetArray(rr,&r);CHKERRQ(ierr); 1469 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 1470 if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 1471 for (i=0; i<mbs; i++) { /* for each block row */ 1472 M = ai[i+1] - ai[i]; 1473 v = aa + bs2*ai[i]; 1474 for (j=0; j<M; j++) { /* for each block */ 1475 ri = r + bs*aj[ai[i]+j]; 1476 for (k=0; k<bs; k++) { 1477 x = ri[k]; 1478 for (tmp=0; tmp<bs; tmp++) (*v++) *= x; 1479 } 1480 } 1481 } 1482 ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr); 1483 PetscLogFlops(a->nz); 1484 } 1485 PetscFunctionReturn(0); 1486 } 1487 1488 #undef __FUNCT__ 1489 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1490 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1491 { 1492 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1493 1494 PetscFunctionBegin; 1495 info->rows_global = (double)A->m; 1496 info->columns_global = (double)A->m; 1497 info->rows_local = (double)A->m; 1498 info->columns_local = (double)A->m; 1499 info->block_size = a->bs2; 1500 info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */ 1501 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1502 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1503 info->assemblies = A->num_ass; 1504 info->mallocs = a->reallocs; 1505 info->memory = A->mem; 1506 if (A->factor) { 1507 info->fill_ratio_given = A->info.fill_ratio_given; 1508 info->fill_ratio_needed = A->info.fill_ratio_needed; 1509 info->factor_mallocs = A->info.factor_mallocs; 1510 } else { 1511 info->fill_ratio_given = 0; 1512 info->fill_ratio_needed = 0; 1513 info->factor_mallocs = 0; 1514 } 1515 PetscFunctionReturn(0); 1516 } 1517 1518 1519 #undef __FUNCT__ 1520 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1521 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1522 { 1523 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1524 PetscErrorCode ierr; 1525 1526 PetscFunctionBegin; 1527 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1528 PetscFunctionReturn(0); 1529 } 1530 1531 #undef __FUNCT__ 1532 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ" 1533 PetscErrorCode MatGetRowMax_SeqSBAIJ(Mat A,Vec v) 1534 { 1535 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1536 PetscErrorCode ierr; 1537 int i,j,n,row,col,bs,*ai,*aj,mbs; 1538 PetscReal atmp; 1539 MatScalar *aa; 1540 PetscScalar zero = 0.0,*x; 1541 int ncols,brow,bcol,krow,kcol; 1542 1543 PetscFunctionBegin; 1544 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1545 bs = a->bs; 1546 aa = a->a; 1547 ai = a->i; 1548 aj = a->j; 1549 mbs = a->mbs; 1550 1551 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1552 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1553 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1554 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1555 for (i=0; i<mbs; i++) { 1556 ncols = ai[1] - ai[0]; ai++; 1557 brow = bs*i; 1558 for (j=0; j<ncols; j++){ 1559 bcol = bs*(*aj); 1560 for (kcol=0; kcol<bs; kcol++){ 1561 col = bcol + kcol; /* col index */ 1562 for (krow=0; krow<bs; krow++){ 1563 atmp = PetscAbsScalar(*aa); aa++; 1564 row = brow + krow; /* row index */ 1565 /* printf("val[%d,%d]: %g\n",row,col,atmp); */ 1566 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1567 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1568 } 1569 } 1570 aj++; 1571 } 1572 } 1573 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1574 PetscFunctionReturn(0); 1575 } 1576