1 #define PETSCMAT_DLL 2 3 #include "src/mat/impls/baij/seq/baij.h" 4 #include "src/inline/spops.h" 5 #include "src/inline/ilu.h" 6 #include "petscbt.h" 7 #include "src/mat/impls/sbaij/seq/sbaij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 11 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 12 { 13 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt brow,i,j,k,l,mbs,n,*idx,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 16 PetscBT table,table0; 17 18 PetscFunctionBegin; 19 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 20 mbs = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->bs; 24 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 25 ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 26 ierr = PetscMalloc((A->m+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 27 ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr); 28 29 for (i=0; i<is_max; i++) { /* for each is */ 30 isz = 0; 31 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 32 33 /* Extract the indices, assume there can be duplicate entries */ 34 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36 37 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 38 bcol_max = 0; 39 for (j=0; j<n ; ++j){ 40 brow = idx[j]/bs; /* convert the indices into block indices */ 41 if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 42 if(!PetscBTLookupSet(table,brow)) { 43 nidx[isz++] = brow; 44 if (bcol_max < brow) bcol_max = brow; 45 } 46 } 47 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 48 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 49 50 k = 0; 51 for (j=0; j<ov; j++){ /* for each overlap */ 52 /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 53 ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr); 54 for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); } 55 56 n = isz; /* length of the updated is[i] */ 57 for (brow=0; brow<mbs; brow++){ 58 start = ai[brow]; end = ai[brow+1]; 59 if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */ 60 for (l = start; l<end ; l++){ 61 bcol = aj[l]; 62 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 63 } 64 k++; 65 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 66 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 67 for (l = start; l<end ; l++){ 68 bcol = aj[l]; 69 if (bcol > bcol_max) break; 70 if (PetscBTLookup(table0,bcol)){ 71 if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;} 72 break; /* for l = start; l<end ; l++) */ 73 } 74 } 75 } 76 } 77 } /* for each overlap */ 78 79 /* expand the Index Set */ 80 for (j=0; j<isz; j++) { 81 for (k=0; k<bs; k++) 82 nidx2[j*bs+k] = nidx[j]*bs+k; 83 } 84 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 85 } 86 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 87 ierr = PetscFree(nidx);CHKERRQ(ierr); 88 ierr = PetscFree(nidx2);CHKERRQ(ierr); 89 ierr = PetscBTDestroy(table0);CHKERRQ(ierr); 90 PetscFunctionReturn(0); 91 } 92 93 #undef __FUNCT__ 94 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 95 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B) 96 { 97 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 98 PetscErrorCode ierr; 99 PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 100 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 101 PetscInt *irow,nrows,*ssmap,bs=A->bs,bs2=a->bs2; 102 PetscInt *aj = a->j,*ai = a->i; 103 MatScalar *mat_a; 104 Mat C; 105 PetscTruth flag; 106 107 PetscFunctionBegin; 108 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 109 ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr); 110 if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 111 112 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 113 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 114 115 ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 116 ssmap = smap; 117 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 118 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 119 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 120 /* determine lens of each row */ 121 for (i=0; i<nrows; i++) { 122 kstart = ai[irow[i]]; 123 kend = kstart + a->ilen[irow[i]]; 124 lens[i] = 0; 125 for (k=kstart; k<kend; k++) { 126 if (ssmap[aj[k]]) { 127 lens[i]++; 128 } 129 } 130 } 131 /* Create and fill new matrix */ 132 if (scall == MAT_REUSE_MATRIX) { 133 c = (Mat_SeqSBAIJ *)((*B)->data); 134 135 if (c->mbs!=nrows || (*B)->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 136 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 137 if (!flag) { 138 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 139 } 140 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 141 C = *B; 142 } else { 143 ierr = MatCreate(A->comm,&C);CHKERRQ(ierr); 144 ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 145 ierr = MatSetType(C,A->type_name);CHKERRQ(ierr); 146 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr); 147 } 148 c = (Mat_SeqSBAIJ *)(C->data); 149 for (i=0; i<nrows; i++) { 150 row = irow[i]; 151 kstart = ai[row]; 152 kend = kstart + a->ilen[row]; 153 mat_i = c->i[i]; 154 mat_j = c->j + mat_i; 155 mat_a = c->a + mat_i*bs2; 156 mat_ilen = c->ilen + i; 157 for (k=kstart; k<kend; k++) { 158 if ((tcol=ssmap[a->j[k]])) { 159 *mat_j++ = tcol - 1; 160 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 161 mat_a += bs2; 162 (*mat_ilen)++; 163 } 164 } 165 } 166 167 /* Free work space */ 168 ierr = PetscFree(smap);CHKERRQ(ierr); 169 ierr = PetscFree(lens);CHKERRQ(ierr); 170 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 171 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 172 173 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 174 *B = C; 175 PetscFunctionReturn(0); 176 } 177 178 #undef __FUNCT__ 179 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 180 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B) 181 { 182 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 183 IS is1; 184 PetscErrorCode ierr; 185 PetscInt *vary,*iary,*irow,nrows,i,bs=A->bs,count; 186 187 PetscFunctionBegin; 188 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 189 190 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 191 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 192 193 /* Verify if the indices corespond to each element in a block 194 and form the IS with compressed IS */ 195 ierr = PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);CHKERRQ(ierr); 196 iary = vary + a->mbs; 197 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 198 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 199 200 count = 0; 201 for (i=0; i<a->mbs; i++) { 202 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 203 if (vary[i]==bs) iary[count++] = i; 204 } 205 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 206 207 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 208 ierr = PetscFree(vary);CHKERRQ(ierr); 209 210 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr); 211 ISDestroy(is1); 212 PetscFunctionReturn(0); 213 } 214 215 #undef __FUNCT__ 216 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 217 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 218 { 219 PetscErrorCode ierr; 220 PetscInt i; 221 222 PetscFunctionBegin; 223 if (scall == MAT_INITIAL_MATRIX) { 224 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 225 } 226 227 for (i=0; i<n; i++) { 228 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr); 229 } 230 PetscFunctionReturn(0); 231 } 232 233 /* -------------------------------------------------------*/ 234 /* Should check that shapes of vectors and matrices match */ 235 /* -------------------------------------------------------*/ 236 #include "petscblaslapack.h" 237 238 #undef __FUNCT__ 239 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 240 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 241 { 242 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 243 PetscScalar *x,*z,*xb,x1,zero=0.0; 244 MatScalar *v; 245 PetscErrorCode ierr; 246 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 247 248 PetscFunctionBegin; 249 ierr = VecSet(zz,zero);CHKERRQ(ierr); 250 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 251 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 252 253 v = a->a; 254 xb = x; 255 256 for (i=0; i<mbs; i++) { 257 n = ai[1] - ai[0]; /* length of i_th row of A */ 258 x1 = xb[0]; 259 ib = aj + *ai; 260 jmin = 0; 261 if (*ib == i) { /* (diag of A)*x */ 262 z[i] += *v++ * x[*ib++]; 263 jmin++; 264 } 265 for (j=jmin; j<n; j++) { 266 cval = *ib; 267 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 268 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 269 } 270 xb++; ai++; 271 } 272 273 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 274 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 275 ierr = PetscLogFlops(2*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr); /* nz = (nz+m)/2 */ 276 PetscFunctionReturn(0); 277 } 278 279 #undef __FUNCT__ 280 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 281 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 282 { 283 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 284 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 285 MatScalar *v; 286 PetscErrorCode ierr; 287 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 288 289 PetscFunctionBegin; 290 ierr = VecSet(zz,zero);CHKERRQ(ierr); 291 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 292 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 293 294 v = a->a; 295 xb = x; 296 297 for (i=0; i<mbs; i++) { 298 n = ai[1] - ai[0]; /* length of i_th block row of A */ 299 x1 = xb[0]; x2 = xb[1]; 300 ib = aj + *ai; 301 jmin = 0; 302 if (*ib == i){ /* (diag of A)*x */ 303 z[2*i] += v[0]*x1 + v[2]*x2; 304 z[2*i+1] += v[2]*x1 + v[3]*x2; 305 v += 4; jmin++; 306 } 307 for (j=jmin; j<n; j++) { 308 /* (strict lower triangular part of A)*x */ 309 cval = ib[j]*2; 310 z[cval] += v[0]*x1 + v[1]*x2; 311 z[cval+1] += v[2]*x1 + v[3]*x2; 312 /* (strict upper triangular part of A)*x */ 313 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 314 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 315 v += 4; 316 } 317 xb +=2; ai++; 318 } 319 320 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 321 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 322 ierr = PetscLogFlops(8*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr); 323 PetscFunctionReturn(0); 324 } 325 326 #undef __FUNCT__ 327 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 328 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 329 { 330 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 331 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 332 MatScalar *v; 333 PetscErrorCode ierr; 334 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 335 336 PetscFunctionBegin; 337 ierr = VecSet(zz,zero);CHKERRQ(ierr); 338 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 339 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 340 341 v = a->a; 342 xb = x; 343 344 for (i=0; i<mbs; i++) { 345 n = ai[1] - ai[0]; /* length of i_th block row of A */ 346 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 347 ib = aj + *ai; 348 jmin = 0; 349 if (*ib == i){ /* (diag of A)*x */ 350 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 351 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 352 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 353 v += 9; jmin++; 354 } 355 for (j=jmin; j<n; j++) { 356 /* (strict lower triangular part of A)*x */ 357 cval = ib[j]*3; 358 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 359 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 360 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 361 /* (strict upper triangular part of A)*x */ 362 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 363 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 364 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 365 v += 9; 366 } 367 xb +=3; ai++; 368 } 369 370 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 371 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 372 ierr = PetscLogFlops(18*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr); 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 378 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 379 { 380 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 381 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 382 MatScalar *v; 383 PetscErrorCode ierr; 384 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 385 386 PetscFunctionBegin; 387 ierr = VecSet(zz,zero);CHKERRQ(ierr); 388 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 389 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 390 391 v = a->a; 392 xb = x; 393 394 for (i=0; i<mbs; i++) { 395 n = ai[1] - ai[0]; /* length of i_th block row of A */ 396 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 397 ib = aj + *ai; 398 jmin = 0; 399 if (*ib == i){ /* (diag of A)*x */ 400 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 401 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 402 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 403 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 404 v += 16; jmin++; 405 } 406 for (j=jmin; j<n; j++) { 407 /* (strict lower triangular part of A)*x */ 408 cval = ib[j]*4; 409 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 410 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 411 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 412 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 413 /* (strict upper triangular part of A)*x */ 414 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 415 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 416 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 417 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 418 v += 16; 419 } 420 xb +=4; ai++; 421 } 422 423 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 424 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 425 ierr = PetscLogFlops(32*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr); 426 PetscFunctionReturn(0); 427 } 428 429 #undef __FUNCT__ 430 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 431 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 432 { 433 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 434 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 435 MatScalar *v; 436 PetscErrorCode ierr; 437 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 438 439 PetscFunctionBegin; 440 ierr = VecSet(zz,zero);CHKERRQ(ierr); 441 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 442 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 443 444 v = a->a; 445 xb = x; 446 447 for (i=0; i<mbs; i++) { 448 n = ai[1] - ai[0]; /* length of i_th block row of A */ 449 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 450 ib = aj + *ai; 451 jmin = 0; 452 if (*ib == i){ /* (diag of A)*x */ 453 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 454 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 455 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 456 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 457 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 458 v += 25; jmin++; 459 } 460 for (j=jmin; j<n; j++) { 461 /* (strict lower triangular part of A)*x */ 462 cval = ib[j]*5; 463 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 464 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 465 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 466 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 467 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 468 /* (strict upper triangular part of A)*x */ 469 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 470 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 471 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 472 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 473 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 474 v += 25; 475 } 476 xb +=5; ai++; 477 } 478 479 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 480 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 481 ierr = PetscLogFlops(50*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr); 482 PetscFunctionReturn(0); 483 } 484 485 486 #undef __FUNCT__ 487 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 488 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 489 { 490 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 491 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 492 MatScalar *v; 493 PetscErrorCode ierr; 494 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 495 496 PetscFunctionBegin; 497 ierr = VecSet(zz,zero);CHKERRQ(ierr); 498 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 499 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 500 501 v = a->a; 502 xb = x; 503 504 for (i=0; i<mbs; i++) { 505 n = ai[1] - ai[0]; /* length of i_th block row of A */ 506 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 507 ib = aj + *ai; 508 jmin = 0; 509 if (*ib == i){ /* (diag of A)*x */ 510 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 511 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 512 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 513 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 514 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 515 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 516 v += 36; jmin++; 517 } 518 for (j=jmin; j<n; j++) { 519 /* (strict lower triangular part of A)*x */ 520 cval = ib[j]*6; 521 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 522 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 523 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 524 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 525 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 526 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 527 /* (strict upper triangular part of A)*x */ 528 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 529 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 530 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 531 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 532 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 533 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 534 v += 36; 535 } 536 xb +=6; ai++; 537 } 538 539 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 540 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 541 ierr = PetscLogFlops(72*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr); 542 PetscFunctionReturn(0); 543 } 544 #undef __FUNCT__ 545 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 546 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 547 { 548 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 549 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 550 MatScalar *v; 551 PetscErrorCode ierr; 552 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 553 554 PetscFunctionBegin; 555 ierr = VecSet(zz,zero);CHKERRQ(ierr); 556 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 557 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 558 559 v = a->a; 560 xb = x; 561 562 for (i=0; i<mbs; i++) { 563 n = ai[1] - ai[0]; /* length of i_th block row of A */ 564 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 565 ib = aj + *ai; 566 jmin = 0; 567 if (*ib == i){ /* (diag of A)*x */ 568 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 569 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 570 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 571 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 572 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 573 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 574 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 575 v += 49; jmin++; 576 } 577 for (j=jmin; j<n; j++) { 578 /* (strict lower triangular part of A)*x */ 579 cval = ib[j]*7; 580 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 581 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 582 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 583 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 584 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 585 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 586 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 587 /* (strict upper triangular part of A)*x */ 588 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 589 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 590 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 591 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 592 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 593 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 594 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 595 v += 49; 596 } 597 xb +=7; ai++; 598 } 599 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 600 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 601 ierr = PetscLogFlops(98*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr); 602 PetscFunctionReturn(0); 603 } 604 605 /* 606 This will not work with MatScalar == float because it calls the BLAS 607 */ 608 #undef __FUNCT__ 609 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 610 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 611 { 612 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 613 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 614 MatScalar *v; 615 PetscErrorCode ierr; 616 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->bs,j,n,bs2=a->bs2,ncols,k; 617 618 PetscFunctionBegin; 619 ierr = VecSet(zz,zero);CHKERRQ(ierr); 620 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 621 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 622 623 aj = a->j; 624 v = a->a; 625 ii = a->i; 626 627 if (!a->mult_work) { 628 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 629 } 630 work = a->mult_work; 631 632 for (i=0; i<mbs; i++) { 633 n = ii[1] - ii[0]; ncols = n*bs; 634 workt = work; idx=aj+ii[0]; 635 636 /* upper triangular part */ 637 for (j=0; j<n; j++) { 638 xb = x_ptr + bs*(*idx++); 639 for (k=0; k<bs; k++) workt[k] = xb[k]; 640 workt += bs; 641 } 642 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 643 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 644 645 /* strict lower triangular part */ 646 idx = aj+ii[0]; 647 if (*idx == i){ 648 ncols -= bs; v += bs2; idx++; n--; 649 } 650 651 if (ncols > 0){ 652 workt = work; 653 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 654 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 655 for (j=0; j<n; j++) { 656 zb = z_ptr + bs*(*idx++); 657 for (k=0; k<bs; k++) zb[k] += workt[k] ; 658 workt += bs; 659 } 660 } 661 x += bs; v += n*bs2; z += bs; ii++; 662 } 663 664 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 665 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 666 ierr = PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m);CHKERRQ(ierr); 667 PetscFunctionReturn(0); 668 } 669 670 #undef __FUNCT__ 671 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 672 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 673 { 674 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 675 PetscScalar *x,*y,*z,*xb,x1; 676 MatScalar *v; 677 PetscErrorCode ierr; 678 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 679 680 PetscFunctionBegin; 681 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 682 if (yy != xx) { 683 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 684 } else { 685 y = x; 686 } 687 if (zz != yy) { 688 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 689 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 690 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 691 } else { 692 z = y; 693 } 694 695 v = a->a; 696 xb = x; 697 698 for (i=0; i<mbs; i++) { 699 n = ai[1] - ai[0]; /* length of i_th row of A */ 700 x1 = xb[0]; 701 ib = aj + *ai; 702 jmin = 0; 703 if (*ib == i) { /* (diag of A)*x */ 704 z[i] += *v++ * x[*ib++]; jmin++; 705 } 706 for (j=jmin; j<n; j++) { 707 cval = *ib; 708 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 709 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 710 } 711 xb++; ai++; 712 } 713 714 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 715 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 716 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 717 718 ierr = PetscLogFlops(2*(a->nz*2 - A->m));CHKERRQ(ierr); 719 PetscFunctionReturn(0); 720 } 721 722 #undef __FUNCT__ 723 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 724 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 725 { 726 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 727 PetscScalar *x,*y,*z,*xb,x1,x2; 728 MatScalar *v; 729 PetscErrorCode ierr; 730 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 731 732 PetscFunctionBegin; 733 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 734 if (yy != xx) { 735 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 736 } else { 737 y = x; 738 } 739 if (zz != yy) { 740 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 741 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 742 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 743 } else { 744 z = y; 745 } 746 747 v = a->a; 748 xb = x; 749 750 for (i=0; i<mbs; i++) { 751 n = ai[1] - ai[0]; /* length of i_th block row of A */ 752 x1 = xb[0]; x2 = xb[1]; 753 ib = aj + *ai; 754 jmin = 0; 755 if (*ib == i){ /* (diag of A)*x */ 756 z[2*i] += v[0]*x1 + v[2]*x2; 757 z[2*i+1] += v[2]*x1 + v[3]*x2; 758 v += 4; jmin++; 759 } 760 for (j=jmin; j<n; j++) { 761 /* (strict lower triangular part of A)*x */ 762 cval = ib[j]*2; 763 z[cval] += v[0]*x1 + v[1]*x2; 764 z[cval+1] += v[2]*x1 + v[3]*x2; 765 /* (strict upper triangular part of A)*x */ 766 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 767 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 768 v += 4; 769 } 770 xb +=2; ai++; 771 } 772 773 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 774 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 775 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 776 777 ierr = PetscLogFlops(4*(a->nz*2 - A->m));CHKERRQ(ierr); 778 PetscFunctionReturn(0); 779 } 780 781 #undef __FUNCT__ 782 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 783 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 784 { 785 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 786 PetscScalar *x,*y,*z,*xb,x1,x2,x3; 787 MatScalar *v; 788 PetscErrorCode ierr; 789 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 790 791 PetscFunctionBegin; 792 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 793 if (yy != xx) { 794 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 795 } else { 796 y = x; 797 } 798 if (zz != yy) { 799 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 800 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 801 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 802 } else { 803 z = y; 804 } 805 806 v = a->a; 807 xb = x; 808 809 for (i=0; i<mbs; i++) { 810 n = ai[1] - ai[0]; /* length of i_th block row of A */ 811 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 812 ib = aj + *ai; 813 jmin = 0; 814 if (*ib == i){ /* (diag of A)*x */ 815 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 816 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 817 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 818 v += 9; jmin++; 819 } 820 for (j=jmin; j<n; j++) { 821 /* (strict lower triangular part of A)*x */ 822 cval = ib[j]*3; 823 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 824 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 825 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 826 /* (strict upper triangular part of A)*x */ 827 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 828 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 829 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 830 v += 9; 831 } 832 xb +=3; ai++; 833 } 834 835 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 836 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 837 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 838 839 ierr = PetscLogFlops(18*(a->nz*2 - A->m));CHKERRQ(ierr); 840 PetscFunctionReturn(0); 841 } 842 843 #undef __FUNCT__ 844 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 845 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 846 { 847 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 848 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4; 849 MatScalar *v; 850 PetscErrorCode ierr; 851 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 852 853 PetscFunctionBegin; 854 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 855 if (yy != xx) { 856 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 857 } else { 858 y = x; 859 } 860 if (zz != yy) { 861 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 862 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 863 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 864 } else { 865 z = y; 866 } 867 868 v = a->a; 869 xb = x; 870 871 for (i=0; i<mbs; i++) { 872 n = ai[1] - ai[0]; /* length of i_th block row of A */ 873 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 874 ib = aj + *ai; 875 jmin = 0; 876 if (*ib == i){ /* (diag of A)*x */ 877 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 878 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 879 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 880 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 881 v += 16; jmin++; 882 } 883 for (j=jmin; j<n; j++) { 884 /* (strict lower triangular part of A)*x */ 885 cval = ib[j]*4; 886 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 887 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 888 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 889 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 890 /* (strict upper triangular part of A)*x */ 891 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 892 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 893 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 894 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 895 v += 16; 896 } 897 xb +=4; ai++; 898 } 899 900 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 901 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 902 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 903 904 ierr = PetscLogFlops(32*(a->nz*2 - A->m));CHKERRQ(ierr); 905 PetscFunctionReturn(0); 906 } 907 908 #undef __FUNCT__ 909 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 910 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 911 { 912 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 913 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5; 914 MatScalar *v; 915 PetscErrorCode ierr; 916 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 917 918 PetscFunctionBegin; 919 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 920 if (yy != xx) { 921 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 922 } else { 923 y = x; 924 } 925 if (zz != yy) { 926 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 927 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 928 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 929 } else { 930 z = y; 931 } 932 933 v = a->a; 934 xb = x; 935 936 for (i=0; i<mbs; i++) { 937 n = ai[1] - ai[0]; /* length of i_th block row of A */ 938 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 939 ib = aj + *ai; 940 jmin = 0; 941 if (*ib == i){ /* (diag of A)*x */ 942 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 943 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 944 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 945 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 946 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 947 v += 25; jmin++; 948 } 949 for (j=jmin; j<n; j++) { 950 /* (strict lower triangular part of A)*x */ 951 cval = ib[j]*5; 952 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 953 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 954 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 955 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 956 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 957 /* (strict upper triangular part of A)*x */ 958 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 959 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 960 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 961 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 962 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 963 v += 25; 964 } 965 xb +=5; ai++; 966 } 967 968 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 969 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 970 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 971 972 ierr = PetscLogFlops(50*(a->nz*2 - A->m));CHKERRQ(ierr); 973 PetscFunctionReturn(0); 974 } 975 #undef __FUNCT__ 976 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 977 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 978 { 979 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 980 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6; 981 MatScalar *v; 982 PetscErrorCode ierr; 983 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 984 985 PetscFunctionBegin; 986 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 987 if (yy != xx) { 988 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 989 } else { 990 y = x; 991 } 992 if (zz != yy) { 993 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 994 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 995 } else { 996 z = y; 997 } 998 999 v = a->a; 1000 xb = x; 1001 1002 for (i=0; i<mbs; i++) { 1003 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1004 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 1005 ib = aj + *ai; 1006 jmin = 0; 1007 if (*ib == i){ /* (diag of A)*x */ 1008 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 1009 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 1010 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 1011 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 1012 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 1013 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1014 v += 36; jmin++; 1015 } 1016 for (j=jmin; j<n; j++) { 1017 /* (strict lower triangular part of A)*x */ 1018 cval = ib[j]*6; 1019 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 1020 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 1021 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 1022 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 1023 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 1024 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1025 /* (strict upper triangular part of A)*x */ 1026 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 1027 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 1028 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 1029 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 1030 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 1031 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 1032 v += 36; 1033 } 1034 xb +=6; ai++; 1035 } 1036 1037 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1038 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1039 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1040 1041 ierr = PetscLogFlops(72*(a->nz*2 - A->m));CHKERRQ(ierr); 1042 PetscFunctionReturn(0); 1043 } 1044 1045 #undef __FUNCT__ 1046 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 1047 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1048 { 1049 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1050 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 1051 MatScalar *v; 1052 PetscErrorCode ierr; 1053 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 1054 1055 PetscFunctionBegin; 1056 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1057 if (yy != xx) { 1058 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1059 } else { 1060 y = x; 1061 } 1062 if (zz != yy) { 1063 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1064 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1065 } else { 1066 z = y; 1067 } 1068 1069 v = a->a; 1070 xb = x; 1071 1072 for (i=0; i<mbs; i++) { 1073 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1074 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1075 ib = aj + *ai; 1076 jmin = 0; 1077 if (*ib == i){ /* (diag of A)*x */ 1078 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1079 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1080 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1081 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1082 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1083 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1084 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1085 v += 49; jmin++; 1086 } 1087 for (j=jmin; j<n; j++) { 1088 /* (strict lower triangular part of A)*x */ 1089 cval = ib[j]*7; 1090 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1091 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1092 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1093 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1094 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1095 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1096 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1097 /* (strict upper triangular part of A)*x */ 1098 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1099 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1100 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1101 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1102 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1103 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1104 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1105 v += 49; 1106 } 1107 xb +=7; ai++; 1108 } 1109 1110 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1111 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1112 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1113 1114 ierr = PetscLogFlops(98*(a->nz*2 - A->m));CHKERRQ(ierr); 1115 PetscFunctionReturn(0); 1116 } 1117 1118 #undef __FUNCT__ 1119 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1120 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1121 { 1122 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1123 PetscScalar *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1124 MatScalar *v; 1125 PetscErrorCode ierr; 1126 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->bs,j,n,bs2=a->bs2,ncols,k; 1127 1128 PetscFunctionBegin; 1129 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 1130 if (yy != xx) { 1131 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1132 } else { 1133 y = x; 1134 } 1135 if (zz != yy) { 1136 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1137 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1138 } else { 1139 z = y; 1140 } 1141 1142 aj = a->j; 1143 v = a->a; 1144 ii = a->i; 1145 1146 if (!a->mult_work) { 1147 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1148 } 1149 work = a->mult_work; 1150 1151 1152 for (i=0; i<mbs; i++) { 1153 n = ii[1] - ii[0]; ncols = n*bs; 1154 workt = work; idx=aj+ii[0]; 1155 1156 /* upper triangular part */ 1157 for (j=0; j<n; j++) { 1158 xb = x_ptr + bs*(*idx++); 1159 for (k=0; k<bs; k++) workt[k] = xb[k]; 1160 workt += bs; 1161 } 1162 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1163 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1164 1165 /* strict lower triangular part */ 1166 idx = aj+ii[0]; 1167 if (*idx == i){ 1168 ncols -= bs; v += bs2; idx++; n--; 1169 } 1170 if (ncols > 0){ 1171 workt = work; 1172 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1173 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1174 for (j=0; j<n; j++) { 1175 zb = z_ptr + bs*(*idx++); 1176 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1177 workt += bs; 1178 } 1179 } 1180 1181 x += bs; v += n*bs2; z += bs; ii++; 1182 } 1183 1184 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1185 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1186 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1187 1188 ierr = PetscLogFlops(2*(a->nz*2 - A->m));CHKERRQ(ierr); 1189 PetscFunctionReturn(0); 1190 } 1191 1192 #undef __FUNCT__ 1193 #define __FUNCT__ "MatScale_SeqSBAIJ" 1194 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1195 { 1196 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1197 PetscScalar oalpha = alpha; 1198 PetscBLASInt one = 1,totalnz = (PetscBLASInt)a->bs2*a->nz; 1199 PetscErrorCode ierr; 1200 1201 PetscFunctionBegin; 1202 BLASscal_(&totalnz,&oalpha,a->a,&one); 1203 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1204 PetscFunctionReturn(0); 1205 } 1206 1207 #undef __FUNCT__ 1208 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1209 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1210 { 1211 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1212 MatScalar *v = a->a; 1213 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1214 PetscInt i,j,k,bs = A->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1215 PetscErrorCode ierr; 1216 PetscInt *jl,*il,jmin,jmax,nexti,ik,*col; 1217 1218 PetscFunctionBegin; 1219 if (type == NORM_FROBENIUS) { 1220 for (k=0; k<mbs; k++){ 1221 jmin = a->i[k]; jmax = a->i[k+1]; 1222 col = aj + jmin; 1223 if (*col == k){ /* diagonal block */ 1224 for (i=0; i<bs2; i++){ 1225 #if defined(PETSC_USE_COMPLEX) 1226 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1227 #else 1228 sum_diag += (*v)*(*v); v++; 1229 #endif 1230 } 1231 jmin++; 1232 } 1233 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1234 for (i=0; i<bs2; i++){ 1235 #if defined(PETSC_USE_COMPLEX) 1236 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1237 #else 1238 sum_off += (*v)*(*v); v++; 1239 #endif 1240 } 1241 } 1242 } 1243 *norm = sqrt(sum_diag + 2*sum_off); 1244 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1245 ierr = PetscMalloc((2*mbs+1)*sizeof(PetscInt)+bs*sizeof(PetscReal),&il);CHKERRQ(ierr); 1246 jl = il + mbs; 1247 sum = (PetscReal*)(jl + mbs); 1248 for (i=0; i<mbs; i++) jl[i] = mbs; 1249 il[0] = 0; 1250 1251 *norm = 0.0; 1252 for (k=0; k<mbs; k++) { /* k_th block row */ 1253 for (j=0; j<bs; j++) sum[j]=0.0; 1254 /*-- col sum --*/ 1255 i = jl[k]; /* first |A(i,k)| to be added */ 1256 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1257 at step k */ 1258 while (i<mbs){ 1259 nexti = jl[i]; /* next block row to be added */ 1260 ik = il[i]; /* block index of A(i,k) in the array a */ 1261 for (j=0; j<bs; j++){ 1262 v = a->a + ik*bs2 + j*bs; 1263 for (k1=0; k1<bs; k1++) { 1264 sum[j] += PetscAbsScalar(*v); v++; 1265 } 1266 } 1267 /* update il, jl */ 1268 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1269 jmax = a->i[i+1]; 1270 if (jmin < jmax){ 1271 il[i] = jmin; 1272 j = a->j[jmin]; 1273 jl[i] = jl[j]; jl[j]=i; 1274 } 1275 i = nexti; 1276 } 1277 /*-- row sum --*/ 1278 jmin = a->i[k]; jmax = a->i[k+1]; 1279 for (i=jmin; i<jmax; i++) { 1280 for (j=0; j<bs; j++){ 1281 v = a->a + i*bs2 + j; 1282 for (k1=0; k1<bs; k1++){ 1283 sum[j] += PetscAbsScalar(*v); v += bs; 1284 } 1285 } 1286 } 1287 /* add k_th block row to il, jl */ 1288 col = aj+jmin; 1289 if (*col == k) jmin++; 1290 if (jmin < jmax){ 1291 il[k] = jmin; 1292 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1293 } 1294 for (j=0; j<bs; j++){ 1295 if (sum[j] > *norm) *norm = sum[j]; 1296 } 1297 } 1298 ierr = PetscFree(il);CHKERRQ(ierr); 1299 } else { 1300 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1301 } 1302 PetscFunctionReturn(0); 1303 } 1304 1305 #undef __FUNCT__ 1306 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1307 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1308 { 1309 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1310 PetscErrorCode ierr; 1311 1312 PetscFunctionBegin; 1313 1314 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1315 if ((A->m != B->m) || (A->n != B->n) || (A->bs != B->bs)|| (a->nz != b->nz)) { 1316 *flg = PETSC_FALSE; 1317 PetscFunctionReturn(0); 1318 } 1319 1320 /* if the a->i are the same */ 1321 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1322 if (!*flg) { 1323 PetscFunctionReturn(0); 1324 } 1325 1326 /* if a->j are the same */ 1327 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1328 if (!*flg) { 1329 PetscFunctionReturn(0); 1330 } 1331 /* if a->a are the same */ 1332 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->bs)*(A->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1333 PetscFunctionReturn(0); 1334 } 1335 1336 #undef __FUNCT__ 1337 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1338 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1339 { 1340 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1341 PetscErrorCode ierr; 1342 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1343 PetscScalar *x,zero = 0.0; 1344 MatScalar *aa,*aa_j; 1345 1346 PetscFunctionBegin; 1347 bs = A->bs; 1348 if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1349 1350 aa = a->a; 1351 ai = a->i; 1352 aj = a->j; 1353 ambs = a->mbs; 1354 bs2 = a->bs2; 1355 1356 ierr = VecSet(v,zero);CHKERRQ(ierr); 1357 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1358 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1359 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1360 for (i=0; i<ambs; i++) { 1361 j=ai[i]; 1362 if (aj[j] == i) { /* if this is a diagonal element */ 1363 row = i*bs; 1364 aa_j = aa + j*bs2; 1365 if (A->factor && bs==1){ 1366 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1367 } else { 1368 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1369 } 1370 } 1371 } 1372 1373 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1374 PetscFunctionReturn(0); 1375 } 1376 1377 #undef __FUNCT__ 1378 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1379 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1380 { 1381 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1382 PetscScalar *l,x,*li,*ri; 1383 MatScalar *aa,*v; 1384 PetscErrorCode ierr; 1385 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1386 PetscTruth flg; 1387 1388 PetscFunctionBegin; 1389 if (ll != rr){ 1390 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1391 if (!flg) 1392 SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1393 } 1394 if (!ll) PetscFunctionReturn(0); 1395 ai = a->i; 1396 aj = a->j; 1397 aa = a->a; 1398 m = A->m; 1399 bs = A->bs; 1400 mbs = a->mbs; 1401 bs2 = a->bs2; 1402 1403 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1404 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1405 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1406 for (i=0; i<mbs; i++) { /* for each block row */ 1407 M = ai[i+1] - ai[i]; 1408 li = l + i*bs; 1409 v = aa + bs2*ai[i]; 1410 for (j=0; j<M; j++) { /* for each block */ 1411 ri = l + bs*aj[ai[i]+j]; 1412 for (k=0; k<bs; k++) { 1413 x = ri[k]; 1414 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1415 } 1416 } 1417 } 1418 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1419 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 1420 PetscFunctionReturn(0); 1421 } 1422 1423 #undef __FUNCT__ 1424 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1425 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1426 { 1427 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1428 1429 PetscFunctionBegin; 1430 info->rows_global = (double)A->m; 1431 info->columns_global = (double)A->m; 1432 info->rows_local = (double)A->m; 1433 info->columns_local = (double)A->m; 1434 info->block_size = a->bs2; 1435 info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */ 1436 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1437 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1438 info->assemblies = A->num_ass; 1439 info->mallocs = a->reallocs; 1440 info->memory = A->mem; 1441 if (A->factor) { 1442 info->fill_ratio_given = A->info.fill_ratio_given; 1443 info->fill_ratio_needed = A->info.fill_ratio_needed; 1444 info->factor_mallocs = A->info.factor_mallocs; 1445 } else { 1446 info->fill_ratio_given = 0; 1447 info->fill_ratio_needed = 0; 1448 info->factor_mallocs = 0; 1449 } 1450 PetscFunctionReturn(0); 1451 } 1452 1453 1454 #undef __FUNCT__ 1455 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1456 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1457 { 1458 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1459 PetscErrorCode ierr; 1460 1461 PetscFunctionBegin; 1462 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1463 PetscFunctionReturn(0); 1464 } 1465 1466 #undef __FUNCT__ 1467 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ" 1468 PetscErrorCode MatGetRowMax_SeqSBAIJ(Mat A,Vec v) 1469 { 1470 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1471 PetscErrorCode ierr; 1472 PetscInt i,j,n,row,col,bs,*ai,*aj,mbs; 1473 PetscReal atmp; 1474 MatScalar *aa; 1475 PetscScalar zero = 0.0,*x; 1476 PetscInt ncols,brow,bcol,krow,kcol; 1477 1478 PetscFunctionBegin; 1479 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1480 bs = A->bs; 1481 aa = a->a; 1482 ai = a->i; 1483 aj = a->j; 1484 mbs = a->mbs; 1485 1486 ierr = VecSet(v,zero);CHKERRQ(ierr); 1487 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1488 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1489 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1490 for (i=0; i<mbs; i++) { 1491 ncols = ai[1] - ai[0]; ai++; 1492 brow = bs*i; 1493 for (j=0; j<ncols; j++){ 1494 bcol = bs*(*aj); 1495 for (kcol=0; kcol<bs; kcol++){ 1496 col = bcol + kcol; /* col index */ 1497 for (krow=0; krow<bs; krow++){ 1498 atmp = PetscAbsScalar(*aa); aa++; 1499 row = brow + krow; /* row index */ 1500 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1501 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1502 } 1503 } 1504 aj++; 1505 } 1506 } 1507 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1508 PetscFunctionReturn(0); 1509 } 1510