1 #define PETSCMAT_DLL 2 3 #include "../src/mat/impls/baij/seq/baij.h" 4 #include "../src/mat/blockinvert.h" 5 #include "petscbt.h" 6 #include "petscblaslapack.h" 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ" 10 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 11 { 12 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 15 const PetscInt *idx; 16 PetscInt start,end,*ai,*aj,bs,*nidx2; 17 PetscBT table; 18 19 PetscFunctionBegin; 20 m = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 25 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 26 27 ierr = PetscBTCreate(m,table);CHKERRQ(ierr); 28 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 29 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 30 31 for (i=0; i<is_max; i++) { 32 /* Initialise the two local arrays */ 33 isz = 0; 34 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 35 36 /* Extract the indices, assume there can be duplicate entries */ 37 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 38 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 39 40 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 41 for (j=0; j<n ; ++j){ 42 ival = idx[j]/bs; /* convert the indices into block indices */ 43 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 44 if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;} 45 } 46 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 47 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 48 49 k = 0; 50 for (j=0; j<ov; j++){ /* for each overlap*/ 51 n = isz; 52 for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */ 53 row = nidx[k]; 54 start = ai[row]; 55 end = ai[row+1]; 56 for (l = start; l<end ; l++){ 57 val = aj[l]; 58 if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;} 59 } 60 } 61 } 62 /* expand the Index Set */ 63 for (j=0; j<isz; j++) { 64 for (k=0; k<bs; k++) 65 nidx2[j*bs+k] = nidx[j]*bs+k; 66 } 67 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 68 } 69 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 70 ierr = PetscFree(nidx);CHKERRQ(ierr); 71 ierr = PetscFree(nidx2);CHKERRQ(ierr); 72 PetscFunctionReturn(0); 73 } 74 75 #undef __FUNCT__ 76 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private" 77 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 78 { 79 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 80 PetscErrorCode ierr; 81 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 82 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 83 const PetscInt *irow,*icol; 84 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 85 PetscInt *aj = a->j,*ai = a->i; 86 MatScalar *mat_a; 87 Mat C; 88 PetscTruth flag,sorted; 89 90 PetscFunctionBegin; 91 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 92 if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 93 94 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 95 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 96 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 97 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 98 99 ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 100 ssmap = smap; 101 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 102 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 103 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 104 /* determine lens of each row */ 105 for (i=0; i<nrows; i++) { 106 kstart = ai[irow[i]]; 107 kend = kstart + a->ilen[irow[i]]; 108 lens[i] = 0; 109 for (k=kstart; k<kend; k++) { 110 if (ssmap[aj[k]]) { 111 lens[i]++; 112 } 113 } 114 } 115 /* Create and fill new matrix */ 116 if (scall == MAT_REUSE_MATRIX) { 117 c = (Mat_SeqBAIJ *)((*B)->data); 118 119 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 120 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 121 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 122 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 123 C = *B; 124 } else { 125 ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 126 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 127 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 128 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr); 129 } 130 c = (Mat_SeqBAIJ *)(C->data); 131 for (i=0; i<nrows; i++) { 132 row = irow[i]; 133 kstart = ai[row]; 134 kend = kstart + a->ilen[row]; 135 mat_i = c->i[i]; 136 mat_j = c->j + mat_i; 137 mat_a = c->a + mat_i*bs2; 138 mat_ilen = c->ilen + i; 139 for (k=kstart; k<kend; k++) { 140 if ((tcol=ssmap[a->j[k]])) { 141 *mat_j++ = tcol - 1; 142 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 143 mat_a += bs2; 144 (*mat_ilen)++; 145 } 146 } 147 } 148 149 /* Free work space */ 150 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 151 ierr = PetscFree(smap);CHKERRQ(ierr); 152 ierr = PetscFree(lens);CHKERRQ(ierr); 153 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 154 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 155 156 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 157 *B = C; 158 PetscFunctionReturn(0); 159 } 160 161 #undef __FUNCT__ 162 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ" 163 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 164 { 165 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 166 IS is1,is2; 167 PetscErrorCode ierr; 168 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count; 169 const PetscInt *irow,*icol; 170 171 PetscFunctionBegin; 172 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 173 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 174 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 175 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 176 177 /* Verify if the indices corespond to each element in a block 178 and form the IS with compressed IS */ 179 ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr); 180 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 181 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 182 count = 0; 183 for (i=0; i<a->mbs; i++) { 184 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 185 if (vary[i]==bs) iary[count++] = i; 186 } 187 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 188 189 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 190 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 191 count = 0; 192 for (i=0; i<a->mbs; i++) { 193 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 194 if (vary[i]==bs) iary[count++] = i; 195 } 196 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);CHKERRQ(ierr); 197 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 198 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 199 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 200 201 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 202 ISDestroy(is1); 203 ISDestroy(is2); 204 PetscFunctionReturn(0); 205 } 206 207 #undef __FUNCT__ 208 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ" 209 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 210 { 211 PetscErrorCode ierr; 212 PetscInt i; 213 214 PetscFunctionBegin; 215 if (scall == MAT_INITIAL_MATRIX) { 216 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 217 } 218 219 for (i=0; i<n; i++) { 220 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 221 } 222 PetscFunctionReturn(0); 223 } 224 225 226 /* -------------------------------------------------------*/ 227 /* Should check that shapes of vectors and matrices match */ 228 /* -------------------------------------------------------*/ 229 230 #undef __FUNCT__ 231 #define __FUNCT__ "MatMult_SeqBAIJ_1" 232 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 233 { 234 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 235 PetscScalar *z,sum; 236 const PetscScalar *x; 237 const MatScalar *v; 238 PetscErrorCode ierr; 239 PetscInt mbs,i,n,nonzerorow=0; 240 const PetscInt *idx,*ii,*ridx=PETSC_NULL; 241 PetscTruth usecprow=a->compressedrow.use; 242 243 PetscFunctionBegin; 244 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 245 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 246 247 if (usecprow){ 248 mbs = a->compressedrow.nrows; 249 ii = a->compressedrow.i; 250 ridx = a->compressedrow.rindex; 251 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 252 } else { 253 mbs = a->mbs; 254 ii = a->i; 255 } 256 257 for (i=0; i<mbs; i++) { 258 n = ii[1] - ii[0]; 259 v = a->a + ii[0]; 260 idx = a->j + ii[0]; 261 ii++; 262 sum = 0.0; 263 PetscSparseDensePlusDot(sum,x,v,idx,n); 264 if (usecprow){ 265 z[ridx[i]] = sum; 266 } else { 267 nonzerorow += (n>0); 268 z[i] = sum; 269 } 270 } 271 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 272 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 273 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 274 PetscFunctionReturn(0); 275 } 276 277 #undef __FUNCT__ 278 #define __FUNCT__ "MatMult_SeqBAIJ_2" 279 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 280 { 281 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 282 PetscScalar *z = 0,sum1,sum2,*zarray; 283 const PetscScalar *x,*xb; 284 PetscScalar x1,x2; 285 const MatScalar *v; 286 PetscErrorCode ierr; 287 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 288 PetscTruth usecprow=a->compressedrow.use; 289 290 PetscFunctionBegin; 291 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 292 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 293 294 idx = a->j; 295 v = a->a; 296 if (usecprow){ 297 mbs = a->compressedrow.nrows; 298 ii = a->compressedrow.i; 299 ridx = a->compressedrow.rindex; 300 } else { 301 mbs = a->mbs; 302 ii = a->i; 303 z = zarray; 304 } 305 306 for (i=0; i<mbs; i++) { 307 n = ii[1] - ii[0]; ii++; 308 sum1 = 0.0; sum2 = 0.0; 309 nonzerorow += (n>0); 310 for (j=0; j<n; j++) { 311 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 312 sum1 += v[0]*x1 + v[2]*x2; 313 sum2 += v[1]*x1 + v[3]*x2; 314 v += 4; 315 } 316 if (usecprow) z = zarray + 2*ridx[i]; 317 z[0] = sum1; z[1] = sum2; 318 if (!usecprow) z += 2; 319 } 320 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 321 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 322 ierr = PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);CHKERRQ(ierr); 323 PetscFunctionReturn(0); 324 } 325 326 #undef __FUNCT__ 327 #define __FUNCT__ "MatMult_SeqBAIJ_3" 328 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 329 { 330 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 331 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 332 const PetscScalar *x,*xb; 333 const MatScalar *v; 334 PetscErrorCode ierr; 335 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 336 PetscTruth usecprow=a->compressedrow.use; 337 338 339 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 340 #pragma disjoint(*v,*z,*xb) 341 #endif 342 343 PetscFunctionBegin; 344 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 345 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 346 347 idx = a->j; 348 v = a->a; 349 if (usecprow){ 350 mbs = a->compressedrow.nrows; 351 ii = a->compressedrow.i; 352 ridx = a->compressedrow.rindex; 353 } else { 354 mbs = a->mbs; 355 ii = a->i; 356 z = zarray; 357 } 358 359 for (i=0; i<mbs; i++) { 360 n = ii[1] - ii[0]; ii++; 361 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 362 nonzerorow += (n>0); 363 for (j=0; j<n; j++) { 364 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 365 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 366 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 367 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 368 v += 9; 369 } 370 if (usecprow) z = zarray + 3*ridx[i]; 371 z[0] = sum1; z[1] = sum2; z[2] = sum3; 372 if (!usecprow) z += 3; 373 } 374 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 375 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 376 ierr = PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);CHKERRQ(ierr); 377 PetscFunctionReturn(0); 378 } 379 380 #undef __FUNCT__ 381 #define __FUNCT__ "MatMult_SeqBAIJ_4" 382 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 383 { 384 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 385 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 386 const PetscScalar *x,*xb; 387 const MatScalar *v; 388 PetscErrorCode ierr; 389 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 390 PetscTruth usecprow=a->compressedrow.use; 391 392 PetscFunctionBegin; 393 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 394 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 395 396 idx = a->j; 397 v = a->a; 398 if (usecprow){ 399 mbs = a->compressedrow.nrows; 400 ii = a->compressedrow.i; 401 ridx = a->compressedrow.rindex; 402 } else { 403 mbs = a->mbs; 404 ii = a->i; 405 z = zarray; 406 } 407 408 for (i=0; i<mbs; i++) { 409 n = ii[1] - ii[0]; ii++; 410 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 411 nonzerorow += (n>0); 412 for (j=0; j<n; j++) { 413 xb = x + 4*(*idx++); 414 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 415 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 416 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 417 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 418 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 419 v += 16; 420 } 421 if (usecprow) z = zarray + 4*ridx[i]; 422 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 423 if (!usecprow) z += 4; 424 } 425 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 426 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 427 ierr = PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);CHKERRQ(ierr); 428 PetscFunctionReturn(0); 429 } 430 431 #undef __FUNCT__ 432 #define __FUNCT__ "MatMult_SeqBAIJ_5" 433 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 434 { 435 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 436 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 437 const PetscScalar *xb,*x; 438 const MatScalar *v; 439 PetscErrorCode ierr; 440 const PetscInt *idx,*ii,*ridx=PETSC_NULL; 441 PetscInt mbs,i,j,n,nonzerorow=0; 442 PetscTruth usecprow=a->compressedrow.use; 443 444 PetscFunctionBegin; 445 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 446 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 447 448 idx = a->j; 449 v = a->a; 450 if (usecprow){ 451 mbs = a->compressedrow.nrows; 452 ii = a->compressedrow.i; 453 ridx = a->compressedrow.rindex; 454 } else { 455 mbs = a->mbs; 456 ii = a->i; 457 z = zarray; 458 } 459 460 for (i=0; i<mbs; i++) { 461 n = ii[1] - ii[0]; ii++; 462 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 463 nonzerorow += (n>0); 464 for (j=0; j<n; j++) { 465 xb = x + 5*(*idx++); 466 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 467 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 468 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 469 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 470 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 471 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 472 v += 25; 473 } 474 if (usecprow) z = zarray + 5*ridx[i]; 475 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 476 if (!usecprow) z += 5; 477 } 478 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 479 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 480 ierr = PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);CHKERRQ(ierr); 481 PetscFunctionReturn(0); 482 } 483 484 485 #undef __FUNCT__ 486 #define __FUNCT__ "MatMult_SeqBAIJ_6" 487 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 488 { 489 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 490 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 491 const PetscScalar *x,*xb; 492 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 493 const MatScalar *v; 494 PetscErrorCode ierr; 495 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 496 PetscTruth usecprow=a->compressedrow.use; 497 498 PetscFunctionBegin; 499 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 500 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 501 502 idx = a->j; 503 v = a->a; 504 if (usecprow){ 505 mbs = a->compressedrow.nrows; 506 ii = a->compressedrow.i; 507 ridx = a->compressedrow.rindex; 508 } else { 509 mbs = a->mbs; 510 ii = a->i; 511 z = zarray; 512 } 513 514 for (i=0; i<mbs; i++) { 515 n = ii[1] - ii[0]; ii++; 516 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; 517 nonzerorow += (n>0); 518 for (j=0; j<n; j++) { 519 xb = x + 6*(*idx++); 520 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 521 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 522 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 523 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 524 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 525 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 526 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 527 v += 36; 528 } 529 if (usecprow) z = zarray + 6*ridx[i]; 530 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 531 if (!usecprow) z += 6; 532 } 533 534 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 535 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 536 ierr = PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);CHKERRQ(ierr); 537 PetscFunctionReturn(0); 538 } 539 540 #undef __FUNCT__ 541 #define __FUNCT__ "MatMult_SeqBAIJ_7" 542 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 543 { 544 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 545 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 546 const PetscScalar *x,*xb; 547 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 548 const MatScalar *v; 549 PetscErrorCode ierr; 550 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 551 PetscTruth usecprow=a->compressedrow.use; 552 553 PetscFunctionBegin; 554 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 555 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 556 557 idx = a->j; 558 v = a->a; 559 if (usecprow){ 560 mbs = a->compressedrow.nrows; 561 ii = a->compressedrow.i; 562 ridx = a->compressedrow.rindex; 563 } else { 564 mbs = a->mbs; 565 ii = a->i; 566 z = zarray; 567 } 568 569 for (i=0; i<mbs; i++) { 570 n = ii[1] - ii[0]; ii++; 571 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 572 nonzerorow += (n>0); 573 for (j=0; j<n; j++) { 574 xb = x + 7*(*idx++); 575 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 576 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 577 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 578 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 579 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 580 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 581 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 582 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 583 v += 49; 584 } 585 if (usecprow) z = zarray + 7*ridx[i]; 586 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 587 if (!usecprow) z += 7; 588 } 589 590 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 591 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 592 ierr = PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);CHKERRQ(ierr); 593 PetscFunctionReturn(0); 594 } 595 596 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 597 /* Default MatMult for block size 15 */ 598 599 #undef __FUNCT__ 600 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1" 601 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 602 { 603 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 604 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 605 const PetscScalar *x,*xb; 606 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 607 const MatScalar *v; 608 PetscErrorCode ierr; 609 const PetscInt *ii,*ij=a->j,*idx; 610 PetscInt mbs,i,j,k,n,*ridx=PETSC_NULL,nonzerorow=0; 611 PetscTruth usecprow=a->compressedrow.use; 612 613 PetscFunctionBegin; 614 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 615 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 616 617 v = a->a; 618 if (usecprow){ 619 mbs = a->compressedrow.nrows; 620 ii = a->compressedrow.i; 621 ridx = a->compressedrow.rindex; 622 } else { 623 mbs = a->mbs; 624 ii = a->i; 625 z = zarray; 626 } 627 628 for (i=0; i<mbs; i++) { 629 n = ii[i+1] - ii[i]; 630 idx = ij + ii[i]; 631 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 632 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 633 634 nonzerorow += (n>0); 635 for (j=0; j<n; j++) { 636 xb = x + 15*(idx[j]); 637 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 638 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 639 640 for(k=0;k<15;k++){ 641 sum1 += v[0]*xb[k]; 642 sum2 += v[1]*xb[k]; 643 sum3 += v[2]*xb[k]; 644 sum4 += v[3]*xb[k]; 645 sum5 += v[4]*xb[k]; 646 sum6 += v[5]*xb[k]; 647 sum7 += v[6]*xb[k]; 648 sum8 += v[7]*xb[k]; 649 sum9 += v[8]*xb[k]; 650 sum10 += v[9]*xb[k]; 651 sum11 += v[10]*xb[k]; 652 sum12 += v[11]*xb[k]; 653 sum13 += v[12]*xb[k]; 654 sum14 += v[13]*xb[k]; 655 sum15 += v[14]*xb[k]; 656 v += 15; 657 } 658 } 659 if (usecprow) z = zarray + 15*ridx[i]; 660 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 661 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 662 663 if (!usecprow) z += 15; 664 } 665 666 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 667 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 668 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 669 PetscFunctionReturn(0); 670 } 671 672 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 673 #undef __FUNCT__ 674 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2" 675 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 676 { 677 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 678 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 679 const PetscScalar *x,*xb; 680 PetscScalar x1,x2,x3,x4,*zarray; 681 const MatScalar *v; 682 PetscErrorCode ierr; 683 const PetscInt *ii,*ij=a->j,*idx; 684 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 685 PetscTruth usecprow=a->compressedrow.use; 686 687 PetscFunctionBegin; 688 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 689 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 690 691 v = a->a; 692 if (usecprow){ 693 mbs = a->compressedrow.nrows; 694 ii = a->compressedrow.i; 695 ridx = a->compressedrow.rindex; 696 } else { 697 mbs = a->mbs; 698 ii = a->i; 699 z = zarray; 700 } 701 702 for (i=0; i<mbs; i++) { 703 n = ii[i+1] - ii[i]; 704 idx = ij + ii[i]; 705 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 706 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 707 708 nonzerorow += (n>0); 709 for (j=0; j<n; j++) { 710 xb = x + 15*(idx[j]); 711 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 712 713 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 714 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 715 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 716 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 717 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 718 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 719 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 720 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 721 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 722 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 723 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 724 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 725 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 726 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 727 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 728 729 v += 60; 730 731 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 732 733 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 734 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 735 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 736 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 737 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 738 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 739 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 740 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 741 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 742 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 743 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 744 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 745 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 746 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 747 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 748 v += 60; 749 750 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 751 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 752 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 753 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 754 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 755 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 756 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 757 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 758 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 759 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 760 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 761 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 762 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 763 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 764 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 765 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 766 v += 60; 767 768 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 769 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 770 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 771 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 772 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 773 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 774 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 775 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 776 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 777 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 778 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 779 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 780 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 781 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 782 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 783 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 784 v += 45; 785 } 786 if (usecprow) z = zarray + 15*ridx[i]; 787 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 788 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 789 790 if (!usecprow) z += 15; 791 } 792 793 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 794 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 795 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 796 PetscFunctionReturn(0); 797 } 798 799 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 800 #undef __FUNCT__ 801 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3" 802 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 803 { 804 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 805 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 806 const PetscScalar *x,*xb; 807 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 808 const MatScalar *v; 809 PetscErrorCode ierr; 810 const PetscInt *ii,*ij=a->j,*idx; 811 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 812 PetscTruth usecprow=a->compressedrow.use; 813 814 PetscFunctionBegin; 815 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 816 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 817 818 v = a->a; 819 if (usecprow){ 820 mbs = a->compressedrow.nrows; 821 ii = a->compressedrow.i; 822 ridx = a->compressedrow.rindex; 823 } else { 824 mbs = a->mbs; 825 ii = a->i; 826 z = zarray; 827 } 828 829 for (i=0; i<mbs; i++) { 830 n = ii[i+1] - ii[i]; 831 idx = ij + ii[i]; 832 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 833 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 834 835 nonzerorow += (n>0); 836 for (j=0; j<n; j++) { 837 xb = x + 15*(idx[j]); 838 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 839 x8 = xb[7]; 840 841 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 842 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 843 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 844 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 845 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 846 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 847 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 848 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 849 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 850 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 851 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 852 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 853 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 854 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 855 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 856 v += 120; 857 858 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 859 860 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 861 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 862 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 863 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 864 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 865 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 866 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 867 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 868 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 869 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 870 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 871 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 872 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 873 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 874 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 875 v += 105; 876 } 877 if (usecprow) z = zarray + 15*ridx[i]; 878 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 879 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 880 881 if (!usecprow) z += 15; 882 } 883 884 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 885 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 886 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 887 PetscFunctionReturn(0); 888 } 889 890 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 891 892 #undef __FUNCT__ 893 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4" 894 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 895 { 896 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 897 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 898 const PetscScalar *x,*xb; 899 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 900 const MatScalar *v; 901 PetscErrorCode ierr; 902 const PetscInt *ii,*ij=a->j,*idx; 903 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 904 PetscTruth usecprow=a->compressedrow.use; 905 906 PetscFunctionBegin; 907 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 908 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 909 910 v = a->a; 911 if (usecprow){ 912 mbs = a->compressedrow.nrows; 913 ii = a->compressedrow.i; 914 ridx = a->compressedrow.rindex; 915 } else { 916 mbs = a->mbs; 917 ii = a->i; 918 z = zarray; 919 } 920 921 for (i=0; i<mbs; i++) { 922 n = ii[i+1] - ii[i]; 923 idx = ij + ii[i]; 924 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 925 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 926 927 nonzerorow += (n>0); 928 for (j=0; j<n; j++) { 929 xb = x + 15*(idx[j]); 930 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 931 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 932 933 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 934 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 935 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 936 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 937 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 938 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 939 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 940 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 941 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 942 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 943 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 944 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 945 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 946 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 947 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 948 v += 225; 949 } 950 if (usecprow) z = zarray + 15*ridx[i]; 951 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 952 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 953 954 if (!usecprow) z += 15; 955 } 956 957 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 958 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 959 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 960 PetscFunctionReturn(0); 961 } 962 963 964 /* 965 This will not work with MatScalar == float because it calls the BLAS 966 */ 967 #undef __FUNCT__ 968 #define __FUNCT__ "MatMult_SeqBAIJ_N" 969 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 970 { 971 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 972 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 973 MatScalar *v; 974 PetscErrorCode ierr; 975 PetscInt mbs=a->mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 976 PetscInt ncols,k,*ridx=PETSC_NULL,nonzerorow=0; 977 PetscTruth usecprow=a->compressedrow.use; 978 979 PetscFunctionBegin; 980 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 981 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 982 983 idx = a->j; 984 v = a->a; 985 if (usecprow){ 986 mbs = a->compressedrow.nrows; 987 ii = a->compressedrow.i; 988 ridx = a->compressedrow.rindex; 989 } else { 990 mbs = a->mbs; 991 ii = a->i; 992 z = zarray; 993 } 994 995 if (!a->mult_work) { 996 k = PetscMax(A->rmap->n,A->cmap->n); 997 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 998 } 999 work = a->mult_work; 1000 for (i=0; i<mbs; i++) { 1001 n = ii[1] - ii[0]; ii++; 1002 ncols = n*bs; 1003 workt = work; 1004 nonzerorow += (n>0); 1005 for (j=0; j<n; j++) { 1006 xb = x + bs*(*idx++); 1007 for (k=0; k<bs; k++) workt[k] = xb[k]; 1008 workt += bs; 1009 } 1010 if (usecprow) z = zarray + bs*ridx[i]; 1011 Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1012 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1013 v += n*bs2; 1014 if (!usecprow) z += bs; 1015 } 1016 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1017 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1018 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);CHKERRQ(ierr); 1019 PetscFunctionReturn(0); 1020 } 1021 1022 extern PetscErrorCode VecCopy_Seq(Vec,Vec); 1023 #undef __FUNCT__ 1024 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1" 1025 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1026 { 1027 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1028 const PetscScalar *x; 1029 PetscScalar *y,*z,sum; 1030 const MatScalar *v; 1031 PetscErrorCode ierr; 1032 PetscInt mbs=a->mbs,i,n,*ridx=PETSC_NULL,nonzerorow=0; 1033 const PetscInt *idx,*ii; 1034 PetscTruth usecprow=a->compressedrow.use; 1035 1036 PetscFunctionBegin; 1037 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 1038 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1039 if (zz != yy) { 1040 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1041 } else { 1042 z = y; 1043 } 1044 1045 idx = a->j; 1046 v = a->a; 1047 if (usecprow){ 1048 if (zz != yy){ 1049 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1050 } 1051 mbs = a->compressedrow.nrows; 1052 ii = a->compressedrow.i; 1053 ridx = a->compressedrow.rindex; 1054 } else { 1055 ii = a->i; 1056 } 1057 1058 for (i=0; i<mbs; i++) { 1059 n = ii[1] - ii[0]; 1060 ii++; 1061 if (!usecprow){ 1062 nonzerorow += (n>0); 1063 sum = y[i]; 1064 } else { 1065 sum = y[ridx[i]]; 1066 } 1067 PetscSparseDensePlusDot(sum,x,v,idx,n); 1068 v += n; 1069 idx += n; 1070 if (usecprow){ 1071 z[ridx[i]] = sum; 1072 } else { 1073 z[i] = sum; 1074 } 1075 } 1076 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 1077 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1078 if (zz != yy) { 1079 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1080 } 1081 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 1082 PetscFunctionReturn(0); 1083 } 1084 1085 #undef __FUNCT__ 1086 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2" 1087 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1088 { 1089 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1090 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2; 1091 PetscScalar x1,x2,*yarray,*zarray; 1092 MatScalar *v; 1093 PetscErrorCode ierr; 1094 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1095 PetscTruth usecprow=a->compressedrow.use; 1096 1097 PetscFunctionBegin; 1098 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1099 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1100 if (zz != yy) { 1101 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1102 } else { 1103 zarray = yarray; 1104 } 1105 1106 idx = a->j; 1107 v = a->a; 1108 if (usecprow){ 1109 if (zz != yy){ 1110 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1111 } 1112 mbs = a->compressedrow.nrows; 1113 ii = a->compressedrow.i; 1114 ridx = a->compressedrow.rindex; 1115 if (zz != yy){ 1116 ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1117 } 1118 } else { 1119 ii = a->i; 1120 y = yarray; 1121 z = zarray; 1122 } 1123 1124 for (i=0; i<mbs; i++) { 1125 n = ii[1] - ii[0]; ii++; 1126 if (usecprow){ 1127 z = zarray + 2*ridx[i]; 1128 y = yarray + 2*ridx[i]; 1129 } 1130 sum1 = y[0]; sum2 = y[1]; 1131 for (j=0; j<n; j++) { 1132 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 1133 sum1 += v[0]*x1 + v[2]*x2; 1134 sum2 += v[1]*x1 + v[3]*x2; 1135 v += 4; 1136 } 1137 z[0] = sum1; z[1] = sum2; 1138 if (!usecprow){ 1139 z += 2; y += 2; 1140 } 1141 } 1142 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1143 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1144 if (zz != yy) { 1145 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1146 } 1147 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1148 PetscFunctionReturn(0); 1149 } 1150 1151 #undef __FUNCT__ 1152 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3" 1153 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1154 { 1155 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1156 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1157 MatScalar *v; 1158 PetscErrorCode ierr; 1159 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1160 PetscTruth usecprow=a->compressedrow.use; 1161 1162 PetscFunctionBegin; 1163 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1164 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1165 if (zz != yy) { 1166 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1167 } else { 1168 zarray = yarray; 1169 } 1170 1171 idx = a->j; 1172 v = a->a; 1173 if (usecprow){ 1174 if (zz != yy){ 1175 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1176 } 1177 mbs = a->compressedrow.nrows; 1178 ii = a->compressedrow.i; 1179 ridx = a->compressedrow.rindex; 1180 } else { 1181 ii = a->i; 1182 y = yarray; 1183 z = zarray; 1184 } 1185 1186 for (i=0; i<mbs; i++) { 1187 n = ii[1] - ii[0]; ii++; 1188 if (usecprow){ 1189 z = zarray + 3*ridx[i]; 1190 y = yarray + 3*ridx[i]; 1191 } 1192 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1193 for (j=0; j<n; j++) { 1194 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1195 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1196 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1197 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1198 v += 9; 1199 } 1200 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1201 if (!usecprow){ 1202 z += 3; y += 3; 1203 } 1204 } 1205 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1206 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1207 if (zz != yy) { 1208 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1209 } 1210 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1211 PetscFunctionReturn(0); 1212 } 1213 1214 #undef __FUNCT__ 1215 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4" 1216 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1217 { 1218 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1219 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1220 MatScalar *v; 1221 PetscErrorCode ierr; 1222 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1223 PetscTruth usecprow=a->compressedrow.use; 1224 1225 PetscFunctionBegin; 1226 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1227 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1228 if (zz != yy) { 1229 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1230 } else { 1231 zarray = yarray; 1232 } 1233 1234 idx = a->j; 1235 v = a->a; 1236 if (usecprow){ 1237 if (zz != yy){ 1238 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1239 } 1240 mbs = a->compressedrow.nrows; 1241 ii = a->compressedrow.i; 1242 ridx = a->compressedrow.rindex; 1243 } else { 1244 ii = a->i; 1245 y = yarray; 1246 z = zarray; 1247 } 1248 1249 for (i=0; i<mbs; i++) { 1250 n = ii[1] - ii[0]; ii++; 1251 if (usecprow){ 1252 z = zarray + 4*ridx[i]; 1253 y = yarray + 4*ridx[i]; 1254 } 1255 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1256 for (j=0; j<n; j++) { 1257 xb = x + 4*(*idx++); 1258 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1259 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1260 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1261 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1262 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1263 v += 16; 1264 } 1265 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1266 if (!usecprow){ 1267 z += 4; y += 4; 1268 } 1269 } 1270 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1271 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1272 if (zz != yy) { 1273 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1274 } 1275 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1276 PetscFunctionReturn(0); 1277 } 1278 1279 #undef __FUNCT__ 1280 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5" 1281 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1282 { 1283 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1284 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1285 PetscScalar *yarray,*zarray; 1286 MatScalar *v; 1287 PetscErrorCode ierr; 1288 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1289 PetscTruth usecprow=a->compressedrow.use; 1290 1291 PetscFunctionBegin; 1292 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1293 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1294 if (zz != yy) { 1295 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1296 } else { 1297 zarray = yarray; 1298 } 1299 1300 idx = a->j; 1301 v = a->a; 1302 if (usecprow){ 1303 if (zz != yy){ 1304 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1305 } 1306 mbs = a->compressedrow.nrows; 1307 ii = a->compressedrow.i; 1308 ridx = a->compressedrow.rindex; 1309 } else { 1310 ii = a->i; 1311 y = yarray; 1312 z = zarray; 1313 } 1314 1315 for (i=0; i<mbs; i++) { 1316 n = ii[1] - ii[0]; ii++; 1317 if (usecprow){ 1318 z = zarray + 5*ridx[i]; 1319 y = yarray + 5*ridx[i]; 1320 } 1321 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1322 for (j=0; j<n; j++) { 1323 xb = x + 5*(*idx++); 1324 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1325 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1326 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1327 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1328 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1329 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1330 v += 25; 1331 } 1332 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1333 if (!usecprow){ 1334 z += 5; y += 5; 1335 } 1336 } 1337 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1338 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1339 if (zz != yy) { 1340 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1341 } 1342 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1343 PetscFunctionReturn(0); 1344 } 1345 #undef __FUNCT__ 1346 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6" 1347 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1348 { 1349 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1350 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6; 1351 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1352 MatScalar *v; 1353 PetscErrorCode ierr; 1354 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1355 PetscTruth usecprow=a->compressedrow.use; 1356 1357 PetscFunctionBegin; 1358 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1359 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1360 if (zz != yy) { 1361 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1362 } else { 1363 zarray = yarray; 1364 } 1365 1366 idx = a->j; 1367 v = a->a; 1368 if (usecprow){ 1369 if (zz != yy){ 1370 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1371 } 1372 mbs = a->compressedrow.nrows; 1373 ii = a->compressedrow.i; 1374 ridx = a->compressedrow.rindex; 1375 } else { 1376 ii = a->i; 1377 y = yarray; 1378 z = zarray; 1379 } 1380 1381 for (i=0; i<mbs; i++) { 1382 n = ii[1] - ii[0]; ii++; 1383 if (usecprow){ 1384 z = zarray + 6*ridx[i]; 1385 y = yarray + 6*ridx[i]; 1386 } 1387 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1388 for (j=0; j<n; j++) { 1389 xb = x + 6*(*idx++); 1390 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1391 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1392 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1393 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1394 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1395 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1396 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1397 v += 36; 1398 } 1399 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1400 if (!usecprow){ 1401 z += 6; y += 6; 1402 } 1403 } 1404 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1405 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1406 if (zz != yy) { 1407 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1408 } 1409 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1410 PetscFunctionReturn(0); 1411 } 1412 1413 #undef __FUNCT__ 1414 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7" 1415 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1416 { 1417 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1418 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1419 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1420 MatScalar *v; 1421 PetscErrorCode ierr; 1422 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1423 PetscTruth usecprow=a->compressedrow.use; 1424 1425 PetscFunctionBegin; 1426 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1427 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1428 if (zz != yy) { 1429 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1430 } else { 1431 zarray = yarray; 1432 } 1433 1434 idx = a->j; 1435 v = a->a; 1436 if (usecprow){ 1437 if (zz != yy){ 1438 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1439 } 1440 mbs = a->compressedrow.nrows; 1441 ii = a->compressedrow.i; 1442 ridx = a->compressedrow.rindex; 1443 } else { 1444 ii = a->i; 1445 y = yarray; 1446 z = zarray; 1447 } 1448 1449 for (i=0; i<mbs; i++) { 1450 n = ii[1] - ii[0]; ii++; 1451 if (usecprow){ 1452 z = zarray + 7*ridx[i]; 1453 y = yarray + 7*ridx[i]; 1454 } 1455 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1456 for (j=0; j<n; j++) { 1457 xb = x + 7*(*idx++); 1458 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1459 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1460 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1461 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1462 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1463 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1464 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1465 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1466 v += 49; 1467 } 1468 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1469 if (!usecprow){ 1470 z += 7; y += 7; 1471 } 1472 } 1473 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1474 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1475 if (zz != yy) { 1476 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1477 } 1478 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1479 PetscFunctionReturn(0); 1480 } 1481 1482 #undef __FUNCT__ 1483 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N" 1484 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1485 { 1486 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1487 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 1488 MatScalar *v; 1489 PetscErrorCode ierr; 1490 PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 1491 PetscInt ncols,k,*ridx=PETSC_NULL; 1492 PetscTruth usecprow=a->compressedrow.use; 1493 1494 PetscFunctionBegin; 1495 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 1496 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1497 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1498 1499 idx = a->j; 1500 v = a->a; 1501 if (usecprow){ 1502 mbs = a->compressedrow.nrows; 1503 ii = a->compressedrow.i; 1504 ridx = a->compressedrow.rindex; 1505 } else { 1506 mbs = a->mbs; 1507 ii = a->i; 1508 z = zarray; 1509 } 1510 1511 if (!a->mult_work) { 1512 k = PetscMax(A->rmap->n,A->cmap->n); 1513 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1514 } 1515 work = a->mult_work; 1516 for (i=0; i<mbs; i++) { 1517 n = ii[1] - ii[0]; ii++; 1518 ncols = n*bs; 1519 workt = work; 1520 for (j=0; j<n; j++) { 1521 xb = x + bs*(*idx++); 1522 for (k=0; k<bs; k++) workt[k] = xb[k]; 1523 workt += bs; 1524 } 1525 if (usecprow) z = zarray + bs*ridx[i]; 1526 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1527 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 1528 v += n*bs2; 1529 if (!usecprow){ 1530 z += bs; 1531 } 1532 } 1533 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1534 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1535 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1536 PetscFunctionReturn(0); 1537 } 1538 1539 #undef __FUNCT__ 1540 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ" 1541 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1542 { 1543 PetscScalar zero = 0.0; 1544 PetscErrorCode ierr; 1545 1546 PetscFunctionBegin; 1547 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1548 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1549 PetscFunctionReturn(0); 1550 } 1551 1552 #undef __FUNCT__ 1553 #define __FUNCT__ "MatMultTranspose_SeqBAIJ" 1554 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1555 { 1556 PetscScalar zero = 0.0; 1557 PetscErrorCode ierr; 1558 1559 PetscFunctionBegin; 1560 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1561 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1562 PetscFunctionReturn(0); 1563 } 1564 1565 #undef __FUNCT__ 1566 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ" 1567 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1568 1569 { 1570 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1571 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1572 MatScalar *v; 1573 PetscErrorCode ierr; 1574 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL; 1575 Mat_CompressedRow cprow = a->compressedrow; 1576 PetscTruth usecprow=cprow.use; 1577 1578 PetscFunctionBegin; 1579 if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); } 1580 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1581 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1582 1583 idx = a->j; 1584 v = a->a; 1585 if (usecprow){ 1586 mbs = cprow.nrows; 1587 ii = cprow.i; 1588 ridx = cprow.rindex; 1589 } else { 1590 mbs=a->mbs; 1591 ii = a->i; 1592 xb = x; 1593 } 1594 1595 switch (bs) { 1596 case 1: 1597 for (i=0; i<mbs; i++) { 1598 if (usecprow) xb = x + ridx[i]; 1599 x1 = xb[0]; 1600 ib = idx + ii[0]; 1601 n = ii[1] - ii[0]; ii++; 1602 for (j=0; j<n; j++) { 1603 rval = ib[j]; 1604 z[rval] += PetscConj(*v) * x1; 1605 v++; 1606 } 1607 if (!usecprow) xb++; 1608 } 1609 break; 1610 case 2: 1611 for (i=0; i<mbs; i++) { 1612 if (usecprow) xb = x + 2*ridx[i]; 1613 x1 = xb[0]; x2 = xb[1]; 1614 ib = idx + ii[0]; 1615 n = ii[1] - ii[0]; ii++; 1616 for (j=0; j<n; j++) { 1617 rval = ib[j]*2; 1618 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 1619 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 1620 v += 4; 1621 } 1622 if (!usecprow) xb += 2; 1623 } 1624 break; 1625 case 3: 1626 for (i=0; i<mbs; i++) { 1627 if (usecprow) xb = x + 3*ridx[i]; 1628 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1629 ib = idx + ii[0]; 1630 n = ii[1] - ii[0]; ii++; 1631 for (j=0; j<n; j++) { 1632 rval = ib[j]*3; 1633 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 1634 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 1635 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 1636 v += 9; 1637 } 1638 if (!usecprow) xb += 3; 1639 } 1640 break; 1641 case 4: 1642 for (i=0; i<mbs; i++) { 1643 if (usecprow) xb = x + 4*ridx[i]; 1644 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1645 ib = idx + ii[0]; 1646 n = ii[1] - ii[0]; ii++; 1647 for (j=0; j<n; j++) { 1648 rval = ib[j]*4; 1649 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 1650 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 1651 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 1652 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 1653 v += 16; 1654 } 1655 if (!usecprow) xb += 4; 1656 } 1657 break; 1658 case 5: 1659 for (i=0; i<mbs; i++) { 1660 if (usecprow) xb = x + 5*ridx[i]; 1661 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1662 x4 = xb[3]; x5 = xb[4]; 1663 ib = idx + ii[0]; 1664 n = ii[1] - ii[0]; ii++; 1665 for (j=0; j<n; j++) { 1666 rval = ib[j]*5; 1667 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 1668 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 1669 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 1670 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 1671 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 1672 v += 25; 1673 } 1674 if (!usecprow) xb += 5; 1675 } 1676 break; 1677 default: { /* block sizes larger than 5 by 5 are handled by BLAS */ 1678 PetscInt ncols,k; 1679 PetscScalar *work,*workt,*xtmp; 1680 1681 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 1682 if (!a->mult_work) { 1683 k = PetscMax(A->rmap->n,A->cmap->n); 1684 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1685 } 1686 work = a->mult_work; 1687 xtmp = x; 1688 for (i=0; i<mbs; i++) { 1689 n = ii[1] - ii[0]; ii++; 1690 ncols = n*bs; 1691 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1692 if (usecprow) { 1693 xtmp = x + bs*ridx[i]; 1694 } 1695 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1696 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1697 v += n*bs2; 1698 if (!usecprow) xtmp += bs; 1699 workt = work; 1700 for (j=0; j<n; j++) { 1701 zb = z + bs*(*idx++); 1702 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1703 workt += bs; 1704 } 1705 } 1706 } 1707 } 1708 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1709 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1710 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1711 PetscFunctionReturn(0); 1712 } 1713 1714 #undef __FUNCT__ 1715 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ" 1716 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1717 { 1718 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1719 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1720 MatScalar *v; 1721 PetscErrorCode ierr; 1722 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL; 1723 Mat_CompressedRow cprow = a->compressedrow; 1724 PetscTruth usecprow=cprow.use; 1725 1726 PetscFunctionBegin; 1727 if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); } 1728 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1729 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1730 1731 idx = a->j; 1732 v = a->a; 1733 if (usecprow){ 1734 mbs = cprow.nrows; 1735 ii = cprow.i; 1736 ridx = cprow.rindex; 1737 } else { 1738 mbs=a->mbs; 1739 ii = a->i; 1740 xb = x; 1741 } 1742 1743 switch (bs) { 1744 case 1: 1745 for (i=0; i<mbs; i++) { 1746 if (usecprow) xb = x + ridx[i]; 1747 x1 = xb[0]; 1748 ib = idx + ii[0]; 1749 n = ii[1] - ii[0]; ii++; 1750 for (j=0; j<n; j++) { 1751 rval = ib[j]; 1752 z[rval] += *v * x1; 1753 v++; 1754 } 1755 if (!usecprow) xb++; 1756 } 1757 break; 1758 case 2: 1759 for (i=0; i<mbs; i++) { 1760 if (usecprow) xb = x + 2*ridx[i]; 1761 x1 = xb[0]; x2 = xb[1]; 1762 ib = idx + ii[0]; 1763 n = ii[1] - ii[0]; ii++; 1764 for (j=0; j<n; j++) { 1765 rval = ib[j]*2; 1766 z[rval++] += v[0]*x1 + v[1]*x2; 1767 z[rval++] += v[2]*x1 + v[3]*x2; 1768 v += 4; 1769 } 1770 if (!usecprow) xb += 2; 1771 } 1772 break; 1773 case 3: 1774 for (i=0; i<mbs; i++) { 1775 if (usecprow) xb = x + 3*ridx[i]; 1776 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1777 ib = idx + ii[0]; 1778 n = ii[1] - ii[0]; ii++; 1779 for (j=0; j<n; j++) { 1780 rval = ib[j]*3; 1781 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 1782 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 1783 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 1784 v += 9; 1785 } 1786 if (!usecprow) xb += 3; 1787 } 1788 break; 1789 case 4: 1790 for (i=0; i<mbs; i++) { 1791 if (usecprow) xb = x + 4*ridx[i]; 1792 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1793 ib = idx + ii[0]; 1794 n = ii[1] - ii[0]; ii++; 1795 for (j=0; j<n; j++) { 1796 rval = ib[j]*4; 1797 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1798 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1799 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1800 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1801 v += 16; 1802 } 1803 if (!usecprow) xb += 4; 1804 } 1805 break; 1806 case 5: 1807 for (i=0; i<mbs; i++) { 1808 if (usecprow) xb = x + 5*ridx[i]; 1809 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1810 x4 = xb[3]; x5 = xb[4]; 1811 ib = idx + ii[0]; 1812 n = ii[1] - ii[0]; ii++; 1813 for (j=0; j<n; j++) { 1814 rval = ib[j]*5; 1815 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1816 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1817 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1818 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1819 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1820 v += 25; 1821 } 1822 if (!usecprow) xb += 5; 1823 } 1824 break; 1825 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 1826 PetscInt ncols,k; 1827 PetscScalar *work,*workt,*xtmp; 1828 1829 if (!a->mult_work) { 1830 k = PetscMax(A->rmap->n,A->cmap->n); 1831 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1832 } 1833 work = a->mult_work; 1834 xtmp = x; 1835 for (i=0; i<mbs; i++) { 1836 n = ii[1] - ii[0]; ii++; 1837 ncols = n*bs; 1838 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1839 if (usecprow) { 1840 xtmp = x + bs*ridx[i]; 1841 } 1842 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1843 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1844 v += n*bs2; 1845 if (!usecprow) xtmp += bs; 1846 workt = work; 1847 for (j=0; j<n; j++) { 1848 zb = z + bs*(*idx++); 1849 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1850 workt += bs; 1851 } 1852 } 1853 } 1854 } 1855 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1856 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1857 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1858 PetscFunctionReturn(0); 1859 } 1860 1861 #undef __FUNCT__ 1862 #define __FUNCT__ "MatScale_SeqBAIJ" 1863 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 1864 { 1865 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 1866 PetscInt totalnz = a->bs2*a->nz; 1867 PetscScalar oalpha = alpha; 1868 PetscErrorCode ierr; 1869 PetscBLASInt one = 1,tnz = PetscBLASIntCast(totalnz); 1870 1871 PetscFunctionBegin; 1872 BLASscal_(&tnz,&oalpha,a->a,&one); 1873 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1874 PetscFunctionReturn(0); 1875 } 1876 1877 #undef __FUNCT__ 1878 #define __FUNCT__ "MatNorm_SeqBAIJ" 1879 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 1880 { 1881 PetscErrorCode ierr; 1882 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1883 MatScalar *v = a->a; 1884 PetscReal sum = 0.0; 1885 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 1886 1887 PetscFunctionBegin; 1888 if (type == NORM_FROBENIUS) { 1889 for (i=0; i< bs2*nz; i++) { 1890 #if defined(PETSC_USE_COMPLEX) 1891 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 1892 #else 1893 sum += (*v)*(*v); v++; 1894 #endif 1895 } 1896 *norm = sqrt(sum); 1897 } else if (type == NORM_1) { /* maximum column sum */ 1898 PetscReal *tmp; 1899 PetscInt *bcol = a->j; 1900 ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr); 1901 ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr); 1902 for (i=0; i<nz; i++){ 1903 for (j=0; j<bs; j++){ 1904 k1 = bs*(*bcol) + j; /* column index */ 1905 for (k=0; k<bs; k++){ 1906 tmp[k1] += PetscAbsScalar(*v); v++; 1907 } 1908 } 1909 bcol++; 1910 } 1911 *norm = 0.0; 1912 for (j=0; j<A->cmap->n; j++) { 1913 if (tmp[j] > *norm) *norm = tmp[j]; 1914 } 1915 ierr = PetscFree(tmp);CHKERRQ(ierr); 1916 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1917 *norm = 0.0; 1918 for (k=0; k<bs; k++) { 1919 for (j=0; j<a->mbs; j++) { 1920 v = a->a + bs2*a->i[j] + k; 1921 sum = 0.0; 1922 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 1923 for (k1=0; k1<bs; k1++){ 1924 sum += PetscAbsScalar(*v); 1925 v += bs; 1926 } 1927 } 1928 if (sum > *norm) *norm = sum; 1929 } 1930 } 1931 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1932 PetscFunctionReturn(0); 1933 } 1934 1935 1936 #undef __FUNCT__ 1937 #define __FUNCT__ "MatEqual_SeqBAIJ" 1938 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg) 1939 { 1940 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data; 1941 PetscErrorCode ierr; 1942 1943 PetscFunctionBegin; 1944 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1945 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1946 *flg = PETSC_FALSE; 1947 PetscFunctionReturn(0); 1948 } 1949 1950 /* if the a->i are the same */ 1951 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1952 if (!*flg) { 1953 PetscFunctionReturn(0); 1954 } 1955 1956 /* if a->j are the same */ 1957 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1958 if (!*flg) { 1959 PetscFunctionReturn(0); 1960 } 1961 /* if a->a are the same */ 1962 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1963 PetscFunctionReturn(0); 1964 1965 } 1966 1967 #undef __FUNCT__ 1968 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ" 1969 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 1970 { 1971 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1972 PetscErrorCode ierr; 1973 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1974 PetscScalar *x,zero = 0.0; 1975 MatScalar *aa,*aa_j; 1976 1977 PetscFunctionBegin; 1978 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1979 bs = A->rmap->bs; 1980 aa = a->a; 1981 ai = a->i; 1982 aj = a->j; 1983 ambs = a->mbs; 1984 bs2 = a->bs2; 1985 1986 ierr = VecSet(v,zero);CHKERRQ(ierr); 1987 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1988 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1989 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1990 for (i=0; i<ambs; i++) { 1991 for (j=ai[i]; j<ai[i+1]; j++) { 1992 if (aj[j] == i) { 1993 row = i*bs; 1994 aa_j = aa+j*bs2; 1995 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1996 break; 1997 } 1998 } 1999 } 2000 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2001 PetscFunctionReturn(0); 2002 } 2003 2004 #undef __FUNCT__ 2005 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ" 2006 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2007 { 2008 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2009 PetscScalar *l,*r,x,*li,*ri; 2010 MatScalar *aa,*v; 2011 PetscErrorCode ierr; 2012 PetscInt i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2; 2013 2014 PetscFunctionBegin; 2015 ai = a->i; 2016 aj = a->j; 2017 aa = a->a; 2018 m = A->rmap->n; 2019 n = A->cmap->n; 2020 bs = A->rmap->bs; 2021 mbs = a->mbs; 2022 bs2 = a->bs2; 2023 if (ll) { 2024 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 2025 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2026 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2027 for (i=0; i<mbs; i++) { /* for each block row */ 2028 M = ai[i+1] - ai[i]; 2029 li = l + i*bs; 2030 v = aa + bs2*ai[i]; 2031 for (j=0; j<M; j++) { /* for each block */ 2032 for (k=0; k<bs2; k++) { 2033 (*v++) *= li[k%bs]; 2034 } 2035 } 2036 } 2037 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 2038 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2039 } 2040 2041 if (rr) { 2042 ierr = VecGetArray(rr,&r);CHKERRQ(ierr); 2043 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2044 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2045 for (i=0; i<mbs; i++) { /* for each block row */ 2046 M = ai[i+1] - ai[i]; 2047 v = aa + bs2*ai[i]; 2048 for (j=0; j<M; j++) { /* for each block */ 2049 ri = r + bs*aj[ai[i]+j]; 2050 for (k=0; k<bs; k++) { 2051 x = ri[k]; 2052 for (tmp=0; tmp<bs; tmp++) (*v++) *= x; 2053 } 2054 } 2055 } 2056 ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr); 2057 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2058 } 2059 PetscFunctionReturn(0); 2060 } 2061 2062 2063 #undef __FUNCT__ 2064 #define __FUNCT__ "MatGetInfo_SeqBAIJ" 2065 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2066 { 2067 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2068 2069 PetscFunctionBegin; 2070 info->block_size = a->bs2; 2071 info->nz_allocated = a->maxnz; 2072 info->nz_used = a->bs2*a->nz; 2073 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2074 info->assemblies = A->num_ass; 2075 info->mallocs = a->reallocs; 2076 info->memory = ((PetscObject)A)->mem; 2077 if (A->factortype) { 2078 info->fill_ratio_given = A->info.fill_ratio_given; 2079 info->fill_ratio_needed = A->info.fill_ratio_needed; 2080 info->factor_mallocs = A->info.factor_mallocs; 2081 } else { 2082 info->fill_ratio_given = 0; 2083 info->fill_ratio_needed = 0; 2084 info->factor_mallocs = 0; 2085 } 2086 PetscFunctionReturn(0); 2087 } 2088 2089 2090 #undef __FUNCT__ 2091 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2092 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2093 { 2094 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2095 PetscErrorCode ierr; 2096 2097 PetscFunctionBegin; 2098 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2099 PetscFunctionReturn(0); 2100 } 2101