1 #define PETSCMAT_DLL 2 3 #include "../src/mat/impls/baij/seq/baij.h" 4 #include "../src/mat/blockinvert.h" 5 #include "petscbt.h" 6 #include "petscblaslapack.h" 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ" 10 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 11 { 12 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 15 const PetscInt *idx; 16 PetscInt start,end,*ai,*aj,bs,*nidx2; 17 PetscBT table; 18 19 PetscFunctionBegin; 20 m = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 25 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 26 27 ierr = PetscBTCreate(m,table);CHKERRQ(ierr); 28 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 29 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 30 31 for (i=0; i<is_max; i++) { 32 /* Initialise the two local arrays */ 33 isz = 0; 34 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 35 36 /* Extract the indices, assume there can be duplicate entries */ 37 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 38 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 39 40 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 41 for (j=0; j<n ; ++j){ 42 ival = idx[j]/bs; /* convert the indices into block indices */ 43 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 44 if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;} 45 } 46 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 47 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 48 49 k = 0; 50 for (j=0; j<ov; j++){ /* for each overlap*/ 51 n = isz; 52 for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */ 53 row = nidx[k]; 54 start = ai[row]; 55 end = ai[row+1]; 56 for (l = start; l<end ; l++){ 57 val = aj[l]; 58 if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;} 59 } 60 } 61 } 62 /* expand the Index Set */ 63 for (j=0; j<isz; j++) { 64 for (k=0; k<bs; k++) 65 nidx2[j*bs+k] = nidx[j]*bs+k; 66 } 67 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 68 } 69 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 70 ierr = PetscFree(nidx);CHKERRQ(ierr); 71 ierr = PetscFree(nidx2);CHKERRQ(ierr); 72 PetscFunctionReturn(0); 73 } 74 75 #undef __FUNCT__ 76 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private" 77 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 78 { 79 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 80 PetscErrorCode ierr; 81 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 82 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 83 const PetscInt *irow,*icol; 84 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 85 PetscInt *aj = a->j,*ai = a->i; 86 MatScalar *mat_a; 87 Mat C; 88 PetscTruth flag,sorted; 89 90 PetscFunctionBegin; 91 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 92 if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 93 94 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 95 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 96 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 97 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 98 99 ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 100 ssmap = smap; 101 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 102 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 103 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 104 /* determine lens of each row */ 105 for (i=0; i<nrows; i++) { 106 kstart = ai[irow[i]]; 107 kend = kstart + a->ilen[irow[i]]; 108 lens[i] = 0; 109 for (k=kstart; k<kend; k++) { 110 if (ssmap[aj[k]]) { 111 lens[i]++; 112 } 113 } 114 } 115 /* Create and fill new matrix */ 116 if (scall == MAT_REUSE_MATRIX) { 117 c = (Mat_SeqBAIJ *)((*B)->data); 118 119 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 120 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 121 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 122 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 123 C = *B; 124 } else { 125 ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 126 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 127 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 128 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr); 129 } 130 c = (Mat_SeqBAIJ *)(C->data); 131 for (i=0; i<nrows; i++) { 132 row = irow[i]; 133 kstart = ai[row]; 134 kend = kstart + a->ilen[row]; 135 mat_i = c->i[i]; 136 mat_j = c->j + mat_i; 137 mat_a = c->a + mat_i*bs2; 138 mat_ilen = c->ilen + i; 139 for (k=kstart; k<kend; k++) { 140 if ((tcol=ssmap[a->j[k]])) { 141 *mat_j++ = tcol - 1; 142 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 143 mat_a += bs2; 144 (*mat_ilen)++; 145 } 146 } 147 } 148 149 /* Free work space */ 150 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 151 ierr = PetscFree(smap);CHKERRQ(ierr); 152 ierr = PetscFree(lens);CHKERRQ(ierr); 153 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 154 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 155 156 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 157 *B = C; 158 PetscFunctionReturn(0); 159 } 160 161 #undef __FUNCT__ 162 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ" 163 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 164 { 165 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 166 IS is1,is2; 167 PetscErrorCode ierr; 168 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count; 169 const PetscInt *irow,*icol; 170 171 PetscFunctionBegin; 172 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 173 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 174 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 175 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 176 177 /* Verify if the indices corespond to each element in a block 178 and form the IS with compressed IS */ 179 ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr); 180 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 181 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 182 count = 0; 183 for (i=0; i<a->mbs; i++) { 184 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 185 if (vary[i]==bs) iary[count++] = i; 186 } 187 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 188 189 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 190 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 191 count = 0; 192 for (i=0; i<a->mbs; i++) { 193 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 194 if (vary[i]==bs) iary[count++] = i; 195 } 196 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);CHKERRQ(ierr); 197 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 198 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 199 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 200 201 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 202 ISDestroy(is1); 203 ISDestroy(is2); 204 PetscFunctionReturn(0); 205 } 206 207 #undef __FUNCT__ 208 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ" 209 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 210 { 211 PetscErrorCode ierr; 212 PetscInt i; 213 214 PetscFunctionBegin; 215 if (scall == MAT_INITIAL_MATRIX) { 216 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 217 } 218 219 for (i=0; i<n; i++) { 220 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 221 } 222 PetscFunctionReturn(0); 223 } 224 225 226 /* -------------------------------------------------------*/ 227 /* Should check that shapes of vectors and matrices match */ 228 /* -------------------------------------------------------*/ 229 230 #undef __FUNCT__ 231 #define __FUNCT__ "MatMult_SeqBAIJ_1" 232 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 233 { 234 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 235 PetscScalar *z,sum; 236 const PetscScalar *x; 237 const MatScalar *v; 238 PetscErrorCode ierr; 239 PetscInt mbs,i,n,nonzerorow=0; 240 const PetscInt *idx,*ii,*ridx=PETSC_NULL; 241 PetscTruth usecprow=a->compressedrow.use; 242 243 PetscFunctionBegin; 244 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 245 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 246 247 if (usecprow){ 248 mbs = a->compressedrow.nrows; 249 ii = a->compressedrow.i; 250 ridx = a->compressedrow.rindex; 251 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 252 } else { 253 mbs = a->mbs; 254 ii = a->i; 255 } 256 257 for (i=0; i<mbs; i++) { 258 n = ii[1] - ii[0]; 259 v = a->a + ii[0]; 260 idx = a->j + ii[0]; 261 ii++; 262 sum = 0.0; 263 PetscSparseDensePlusDot(sum,x,v,idx,n); 264 if (usecprow){ 265 z[ridx[i]] = sum; 266 } else { 267 nonzerorow += (n>0); 268 z[i] = sum; 269 } 270 } 271 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 272 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 273 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 274 PetscFunctionReturn(0); 275 } 276 277 #undef __FUNCT__ 278 #define __FUNCT__ "MatMult_SeqBAIJ_2" 279 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 280 { 281 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 282 PetscScalar *z = 0,sum1,sum2,*zarray; 283 const PetscScalar *x,*xb; 284 PetscScalar x1,x2; 285 const MatScalar *v; 286 PetscErrorCode ierr; 287 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 288 PetscTruth usecprow=a->compressedrow.use; 289 290 PetscFunctionBegin; 291 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 292 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 293 294 idx = a->j; 295 v = a->a; 296 if (usecprow){ 297 mbs = a->compressedrow.nrows; 298 ii = a->compressedrow.i; 299 ridx = a->compressedrow.rindex; 300 } else { 301 mbs = a->mbs; 302 ii = a->i; 303 z = zarray; 304 } 305 306 for (i=0; i<mbs; i++) { 307 n = ii[1] - ii[0]; ii++; 308 sum1 = 0.0; sum2 = 0.0; 309 nonzerorow += (n>0); 310 for (j=0; j<n; j++) { 311 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 312 sum1 += v[0]*x1 + v[2]*x2; 313 sum2 += v[1]*x1 + v[3]*x2; 314 v += 4; 315 } 316 if (usecprow) z = zarray + 2*ridx[i]; 317 z[0] = sum1; z[1] = sum2; 318 if (!usecprow) z += 2; 319 } 320 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 321 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 322 ierr = PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);CHKERRQ(ierr); 323 PetscFunctionReturn(0); 324 } 325 326 #undef __FUNCT__ 327 #define __FUNCT__ "MatMult_SeqBAIJ_3" 328 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 329 { 330 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 331 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 332 const PetscScalar *x,*xb; 333 const MatScalar *v; 334 PetscErrorCode ierr; 335 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 336 PetscTruth usecprow=a->compressedrow.use; 337 338 339 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 340 #pragma disjoint(*v,*z,*xb) 341 #endif 342 343 PetscFunctionBegin; 344 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 345 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 346 347 idx = a->j; 348 v = a->a; 349 if (usecprow){ 350 mbs = a->compressedrow.nrows; 351 ii = a->compressedrow.i; 352 ridx = a->compressedrow.rindex; 353 } else { 354 mbs = a->mbs; 355 ii = a->i; 356 z = zarray; 357 } 358 359 for (i=0; i<mbs; i++) { 360 n = ii[1] - ii[0]; ii++; 361 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 362 nonzerorow += (n>0); 363 for (j=0; j<n; j++) { 364 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 365 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 366 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 367 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 368 v += 9; 369 } 370 if (usecprow) z = zarray + 3*ridx[i]; 371 z[0] = sum1; z[1] = sum2; z[2] = sum3; 372 if (!usecprow) z += 3; 373 } 374 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 375 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 376 ierr = PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);CHKERRQ(ierr); 377 PetscFunctionReturn(0); 378 } 379 380 #undef __FUNCT__ 381 #define __FUNCT__ "MatMult_SeqBAIJ_4" 382 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 383 { 384 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 385 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 386 const PetscScalar *x,*xb; 387 const MatScalar *v; 388 PetscErrorCode ierr; 389 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 390 PetscTruth usecprow=a->compressedrow.use; 391 392 PetscFunctionBegin; 393 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 394 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 395 396 idx = a->j; 397 v = a->a; 398 if (usecprow){ 399 mbs = a->compressedrow.nrows; 400 ii = a->compressedrow.i; 401 ridx = a->compressedrow.rindex; 402 } else { 403 mbs = a->mbs; 404 ii = a->i; 405 z = zarray; 406 } 407 408 for (i=0; i<mbs; i++) { 409 n = ii[1] - ii[0]; ii++; 410 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 411 nonzerorow += (n>0); 412 for (j=0; j<n; j++) { 413 xb = x + 4*(*idx++); 414 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 415 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 416 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 417 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 418 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 419 v += 16; 420 } 421 if (usecprow) z = zarray + 4*ridx[i]; 422 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 423 if (!usecprow) z += 4; 424 } 425 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 426 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 427 ierr = PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);CHKERRQ(ierr); 428 PetscFunctionReturn(0); 429 } 430 431 #undef __FUNCT__ 432 #define __FUNCT__ "MatMult_SeqBAIJ_5" 433 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 434 { 435 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 436 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 437 const PetscScalar *xb,*x; 438 const MatScalar *v; 439 PetscErrorCode ierr; 440 const PetscInt *idx,*ii,*ridx=PETSC_NULL; 441 PetscInt mbs,i,j,n,nonzerorow=0; 442 PetscTruth usecprow=a->compressedrow.use; 443 444 PetscFunctionBegin; 445 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 446 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 447 448 idx = a->j; 449 v = a->a; 450 if (usecprow){ 451 mbs = a->compressedrow.nrows; 452 ii = a->compressedrow.i; 453 ridx = a->compressedrow.rindex; 454 } else { 455 mbs = a->mbs; 456 ii = a->i; 457 z = zarray; 458 } 459 460 for (i=0; i<mbs; i++) { 461 n = ii[1] - ii[0]; ii++; 462 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 463 nonzerorow += (n>0); 464 for (j=0; j<n; j++) { 465 xb = x + 5*(*idx++); 466 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 467 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 468 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 469 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 470 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 471 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 472 v += 25; 473 } 474 if (usecprow) z = zarray + 5*ridx[i]; 475 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 476 if (!usecprow) z += 5; 477 } 478 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 479 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 480 ierr = PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);CHKERRQ(ierr); 481 PetscFunctionReturn(0); 482 } 483 484 485 #undef __FUNCT__ 486 #define __FUNCT__ "MatMult_SeqBAIJ_6" 487 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 488 { 489 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 490 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 491 const PetscScalar *x,*xb; 492 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 493 const MatScalar *v; 494 PetscErrorCode ierr; 495 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 496 PetscTruth usecprow=a->compressedrow.use; 497 498 PetscFunctionBegin; 499 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 500 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 501 502 idx = a->j; 503 v = a->a; 504 if (usecprow){ 505 mbs = a->compressedrow.nrows; 506 ii = a->compressedrow.i; 507 ridx = a->compressedrow.rindex; 508 } else { 509 mbs = a->mbs; 510 ii = a->i; 511 z = zarray; 512 } 513 514 for (i=0; i<mbs; i++) { 515 n = ii[1] - ii[0]; ii++; 516 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; 517 nonzerorow += (n>0); 518 for (j=0; j<n; j++) { 519 xb = x + 6*(*idx++); 520 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 521 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 522 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 523 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 524 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 525 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 526 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 527 v += 36; 528 } 529 if (usecprow) z = zarray + 6*ridx[i]; 530 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 531 if (!usecprow) z += 6; 532 } 533 534 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 535 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 536 ierr = PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);CHKERRQ(ierr); 537 PetscFunctionReturn(0); 538 } 539 540 #undef __FUNCT__ 541 #define __FUNCT__ "MatMult_SeqBAIJ_7" 542 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 543 { 544 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 545 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 546 const PetscScalar *x,*xb; 547 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 548 const MatScalar *v; 549 PetscErrorCode ierr; 550 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 551 PetscTruth usecprow=a->compressedrow.use; 552 553 PetscFunctionBegin; 554 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 555 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 556 557 idx = a->j; 558 v = a->a; 559 if (usecprow){ 560 mbs = a->compressedrow.nrows; 561 ii = a->compressedrow.i; 562 ridx = a->compressedrow.rindex; 563 } else { 564 mbs = a->mbs; 565 ii = a->i; 566 z = zarray; 567 } 568 569 for (i=0; i<mbs; i++) { 570 n = ii[1] - ii[0]; ii++; 571 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 572 nonzerorow += (n>0); 573 for (j=0; j<n; j++) { 574 xb = x + 7*(*idx++); 575 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 576 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 577 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 578 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 579 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 580 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 581 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 582 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 583 v += 49; 584 } 585 if (usecprow) z = zarray + 7*ridx[i]; 586 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 587 if (!usecprow) z += 7; 588 } 589 590 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 591 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 592 ierr = PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);CHKERRQ(ierr); 593 PetscFunctionReturn(0); 594 } 595 596 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 597 /* Default MatMult for block size 15 */ 598 599 #undef __FUNCT__ 600 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1" 601 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 602 { 603 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 604 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 605 const PetscScalar *x,*xb; 606 PetscScalar *zarray,xv; 607 const MatScalar *v; 608 PetscErrorCode ierr; 609 const PetscInt *ii,*ij=a->j,*idx; 610 PetscInt mbs,i,j,k,n,*ridx=PETSC_NULL,nonzerorow=0; 611 PetscTruth usecprow=a->compressedrow.use; 612 613 PetscFunctionBegin; 614 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 615 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 616 617 v = a->a; 618 if (usecprow){ 619 mbs = a->compressedrow.nrows; 620 ii = a->compressedrow.i; 621 ridx = a->compressedrow.rindex; 622 } else { 623 mbs = a->mbs; 624 ii = a->i; 625 z = zarray; 626 } 627 628 for (i=0; i<mbs; i++) { 629 n = ii[i+1] - ii[i]; 630 idx = ij + ii[i]; 631 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 632 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 633 634 nonzerorow += (n>0); 635 for (j=0; j<n; j++) { 636 xb = x + 15*(idx[j]); 637 638 for(k=0;k<15;k++){ 639 xv = xb[k]; 640 sum1 += v[0]*xv; 641 sum2 += v[1]*xv; 642 sum3 += v[2]*xv; 643 sum4 += v[3]*xv; 644 sum5 += v[4]*xv; 645 sum6 += v[5]*xv; 646 sum7 += v[6]*xv; 647 sum8 += v[7]*xv; 648 sum9 += v[8]*xv; 649 sum10 += v[9]*xv; 650 sum11 += v[10]*xv; 651 sum12 += v[11]*xv; 652 sum13 += v[12]*xv; 653 sum14 += v[13]*xv; 654 sum15 += v[14]*xv; 655 v += 15; 656 } 657 } 658 if (usecprow) z = zarray + 15*ridx[i]; 659 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 660 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 661 662 if (!usecprow) z += 15; 663 } 664 665 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 666 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 667 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 668 PetscFunctionReturn(0); 669 } 670 671 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 672 #undef __FUNCT__ 673 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2" 674 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 675 { 676 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 677 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 678 const PetscScalar *x,*xb; 679 PetscScalar x1,x2,x3,x4,*zarray; 680 const MatScalar *v; 681 PetscErrorCode ierr; 682 const PetscInt *ii,*ij=a->j,*idx; 683 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 684 PetscTruth usecprow=a->compressedrow.use; 685 686 PetscFunctionBegin; 687 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 688 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 689 690 v = a->a; 691 if (usecprow){ 692 mbs = a->compressedrow.nrows; 693 ii = a->compressedrow.i; 694 ridx = a->compressedrow.rindex; 695 } else { 696 mbs = a->mbs; 697 ii = a->i; 698 z = zarray; 699 } 700 701 for (i=0; i<mbs; i++) { 702 n = ii[i+1] - ii[i]; 703 idx = ij + ii[i]; 704 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 705 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 706 707 nonzerorow += (n>0); 708 for (j=0; j<n; j++) { 709 xb = x + 15*(idx[j]); 710 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 711 712 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 713 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 714 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 715 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 716 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 717 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 718 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 719 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 720 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 721 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 722 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 723 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 724 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 725 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 726 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 727 728 v += 60; 729 730 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 731 732 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 733 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 734 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 735 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 736 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 737 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 738 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 739 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 740 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 741 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 742 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 743 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 744 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 745 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 746 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 747 v += 60; 748 749 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 750 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 751 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 752 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 753 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 754 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 755 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 756 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 757 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 758 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 759 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 760 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 761 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 762 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 763 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 764 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 765 v += 60; 766 767 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 768 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 769 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 770 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 771 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 772 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 773 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 774 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 775 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 776 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 777 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 778 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 779 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 780 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 781 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 782 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 783 v += 45; 784 } 785 if (usecprow) z = zarray + 15*ridx[i]; 786 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 787 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 788 789 if (!usecprow) z += 15; 790 } 791 792 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 793 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 794 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 795 PetscFunctionReturn(0); 796 } 797 798 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 799 #undef __FUNCT__ 800 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3" 801 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 802 { 803 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 804 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 805 const PetscScalar *x,*xb; 806 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 807 const MatScalar *v; 808 PetscErrorCode ierr; 809 const PetscInt *ii,*ij=a->j,*idx; 810 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 811 PetscTruth usecprow=a->compressedrow.use; 812 813 PetscFunctionBegin; 814 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 815 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 816 817 v = a->a; 818 if (usecprow){ 819 mbs = a->compressedrow.nrows; 820 ii = a->compressedrow.i; 821 ridx = a->compressedrow.rindex; 822 } else { 823 mbs = a->mbs; 824 ii = a->i; 825 z = zarray; 826 } 827 828 for (i=0; i<mbs; i++) { 829 n = ii[i+1] - ii[i]; 830 idx = ij + ii[i]; 831 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 832 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 833 834 nonzerorow += (n>0); 835 for (j=0; j<n; j++) { 836 xb = x + 15*(idx[j]); 837 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 838 x8 = xb[7]; 839 840 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 841 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 842 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 843 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 844 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 845 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 846 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 847 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 848 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 849 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 850 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 851 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 852 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 853 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 854 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 855 v += 120; 856 857 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 858 859 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 860 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 861 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 862 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 863 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 864 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 865 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 866 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 867 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 868 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 869 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 870 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 871 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 872 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 873 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 874 v += 105; 875 } 876 if (usecprow) z = zarray + 15*ridx[i]; 877 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 878 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 879 880 if (!usecprow) z += 15; 881 } 882 883 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 884 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 885 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 886 PetscFunctionReturn(0); 887 } 888 889 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 890 891 #undef __FUNCT__ 892 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4" 893 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 894 { 895 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 896 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 897 const PetscScalar *x,*xb; 898 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 899 const MatScalar *v; 900 PetscErrorCode ierr; 901 const PetscInt *ii,*ij=a->j,*idx; 902 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 903 PetscTruth usecprow=a->compressedrow.use; 904 905 PetscFunctionBegin; 906 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 907 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 908 909 v = a->a; 910 if (usecprow){ 911 mbs = a->compressedrow.nrows; 912 ii = a->compressedrow.i; 913 ridx = a->compressedrow.rindex; 914 } else { 915 mbs = a->mbs; 916 ii = a->i; 917 z = zarray; 918 } 919 920 for (i=0; i<mbs; i++) { 921 n = ii[i+1] - ii[i]; 922 idx = ij + ii[i]; 923 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 924 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 925 926 nonzerorow += (n>0); 927 for (j=0; j<n; j++) { 928 xb = x + 15*(idx[j]); 929 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 930 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 931 932 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 933 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 934 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 935 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 936 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 937 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 938 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 939 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 940 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 941 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 942 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 943 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 944 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 945 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 946 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 947 v += 225; 948 } 949 if (usecprow) z = zarray + 15*ridx[i]; 950 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 951 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 952 953 if (!usecprow) z += 15; 954 } 955 956 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 957 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 958 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 959 PetscFunctionReturn(0); 960 } 961 962 963 /* 964 This will not work with MatScalar == float because it calls the BLAS 965 */ 966 #undef __FUNCT__ 967 #define __FUNCT__ "MatMult_SeqBAIJ_N" 968 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 969 { 970 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 971 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 972 MatScalar *v; 973 PetscErrorCode ierr; 974 PetscInt mbs=a->mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 975 PetscInt ncols,k,*ridx=PETSC_NULL,nonzerorow=0; 976 PetscTruth usecprow=a->compressedrow.use; 977 978 PetscFunctionBegin; 979 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 980 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 981 982 idx = a->j; 983 v = a->a; 984 if (usecprow){ 985 mbs = a->compressedrow.nrows; 986 ii = a->compressedrow.i; 987 ridx = a->compressedrow.rindex; 988 } else { 989 mbs = a->mbs; 990 ii = a->i; 991 z = zarray; 992 } 993 994 if (!a->mult_work) { 995 k = PetscMax(A->rmap->n,A->cmap->n); 996 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 997 } 998 work = a->mult_work; 999 for (i=0; i<mbs; i++) { 1000 n = ii[1] - ii[0]; ii++; 1001 ncols = n*bs; 1002 workt = work; 1003 nonzerorow += (n>0); 1004 for (j=0; j<n; j++) { 1005 xb = x + bs*(*idx++); 1006 for (k=0; k<bs; k++) workt[k] = xb[k]; 1007 workt += bs; 1008 } 1009 if (usecprow) z = zarray + bs*ridx[i]; 1010 Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1011 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1012 v += n*bs2; 1013 if (!usecprow) z += bs; 1014 } 1015 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1016 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1017 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);CHKERRQ(ierr); 1018 PetscFunctionReturn(0); 1019 } 1020 1021 extern PetscErrorCode VecCopy_Seq(Vec,Vec); 1022 #undef __FUNCT__ 1023 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1" 1024 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1025 { 1026 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1027 const PetscScalar *x; 1028 PetscScalar *y,*z,sum; 1029 const MatScalar *v; 1030 PetscErrorCode ierr; 1031 PetscInt mbs=a->mbs,i,n,*ridx=PETSC_NULL,nonzerorow=0; 1032 const PetscInt *idx,*ii; 1033 PetscTruth usecprow=a->compressedrow.use; 1034 1035 PetscFunctionBegin; 1036 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1037 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1038 if (zz != yy) { 1039 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1040 } else { 1041 z = y; 1042 } 1043 1044 idx = a->j; 1045 v = a->a; 1046 if (usecprow){ 1047 if (zz != yy){ 1048 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1049 } 1050 mbs = a->compressedrow.nrows; 1051 ii = a->compressedrow.i; 1052 ridx = a->compressedrow.rindex; 1053 } else { 1054 ii = a->i; 1055 } 1056 1057 for (i=0; i<mbs; i++) { 1058 n = ii[1] - ii[0]; 1059 ii++; 1060 if (!usecprow){ 1061 nonzerorow += (n>0); 1062 sum = y[i]; 1063 } else { 1064 sum = y[ridx[i]]; 1065 } 1066 PetscSparseDensePlusDot(sum,x,v,idx,n); 1067 v += n; 1068 idx += n; 1069 if (usecprow){ 1070 z[ridx[i]] = sum; 1071 } else { 1072 z[i] = sum; 1073 } 1074 } 1075 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1076 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1077 if (zz != yy) { 1078 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1079 } 1080 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 1081 PetscFunctionReturn(0); 1082 } 1083 1084 #undef __FUNCT__ 1085 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2" 1086 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1087 { 1088 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1089 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2; 1090 PetscScalar x1,x2,*yarray,*zarray; 1091 MatScalar *v; 1092 PetscErrorCode ierr; 1093 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1094 PetscTruth usecprow=a->compressedrow.use; 1095 1096 PetscFunctionBegin; 1097 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1098 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1099 if (zz != yy) { 1100 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1101 } else { 1102 zarray = yarray; 1103 } 1104 1105 idx = a->j; 1106 v = a->a; 1107 if (usecprow){ 1108 if (zz != yy){ 1109 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1110 } 1111 mbs = a->compressedrow.nrows; 1112 ii = a->compressedrow.i; 1113 ridx = a->compressedrow.rindex; 1114 if (zz != yy){ 1115 ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1116 } 1117 } else { 1118 ii = a->i; 1119 y = yarray; 1120 z = zarray; 1121 } 1122 1123 for (i=0; i<mbs; i++) { 1124 n = ii[1] - ii[0]; ii++; 1125 if (usecprow){ 1126 z = zarray + 2*ridx[i]; 1127 y = yarray + 2*ridx[i]; 1128 } 1129 sum1 = y[0]; sum2 = y[1]; 1130 for (j=0; j<n; j++) { 1131 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 1132 sum1 += v[0]*x1 + v[2]*x2; 1133 sum2 += v[1]*x1 + v[3]*x2; 1134 v += 4; 1135 } 1136 z[0] = sum1; z[1] = sum2; 1137 if (!usecprow){ 1138 z += 2; y += 2; 1139 } 1140 } 1141 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1142 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1143 if (zz != yy) { 1144 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1145 } 1146 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1147 PetscFunctionReturn(0); 1148 } 1149 1150 #undef __FUNCT__ 1151 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3" 1152 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1153 { 1154 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1155 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1156 MatScalar *v; 1157 PetscErrorCode ierr; 1158 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1159 PetscTruth usecprow=a->compressedrow.use; 1160 1161 PetscFunctionBegin; 1162 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1163 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1164 if (zz != yy) { 1165 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1166 } else { 1167 zarray = yarray; 1168 } 1169 1170 idx = a->j; 1171 v = a->a; 1172 if (usecprow){ 1173 if (zz != yy){ 1174 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1175 } 1176 mbs = a->compressedrow.nrows; 1177 ii = a->compressedrow.i; 1178 ridx = a->compressedrow.rindex; 1179 } else { 1180 ii = a->i; 1181 y = yarray; 1182 z = zarray; 1183 } 1184 1185 for (i=0; i<mbs; i++) { 1186 n = ii[1] - ii[0]; ii++; 1187 if (usecprow){ 1188 z = zarray + 3*ridx[i]; 1189 y = yarray + 3*ridx[i]; 1190 } 1191 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1192 for (j=0; j<n; j++) { 1193 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1194 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1195 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1196 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1197 v += 9; 1198 } 1199 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1200 if (!usecprow){ 1201 z += 3; y += 3; 1202 } 1203 } 1204 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1205 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1206 if (zz != yy) { 1207 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1208 } 1209 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1210 PetscFunctionReturn(0); 1211 } 1212 1213 #undef __FUNCT__ 1214 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4" 1215 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1216 { 1217 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1218 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1219 MatScalar *v; 1220 PetscErrorCode ierr; 1221 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1222 PetscTruth usecprow=a->compressedrow.use; 1223 1224 PetscFunctionBegin; 1225 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1226 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1227 if (zz != yy) { 1228 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1229 } else { 1230 zarray = yarray; 1231 } 1232 1233 idx = a->j; 1234 v = a->a; 1235 if (usecprow){ 1236 if (zz != yy){ 1237 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1238 } 1239 mbs = a->compressedrow.nrows; 1240 ii = a->compressedrow.i; 1241 ridx = a->compressedrow.rindex; 1242 } else { 1243 ii = a->i; 1244 y = yarray; 1245 z = zarray; 1246 } 1247 1248 for (i=0; i<mbs; i++) { 1249 n = ii[1] - ii[0]; ii++; 1250 if (usecprow){ 1251 z = zarray + 4*ridx[i]; 1252 y = yarray + 4*ridx[i]; 1253 } 1254 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1255 for (j=0; j<n; j++) { 1256 xb = x + 4*(*idx++); 1257 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1258 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1259 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1260 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1261 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1262 v += 16; 1263 } 1264 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1265 if (!usecprow){ 1266 z += 4; y += 4; 1267 } 1268 } 1269 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1270 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1271 if (zz != yy) { 1272 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1273 } 1274 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1275 PetscFunctionReturn(0); 1276 } 1277 1278 #undef __FUNCT__ 1279 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5" 1280 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1281 { 1282 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1283 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1284 PetscScalar *yarray,*zarray; 1285 MatScalar *v; 1286 PetscErrorCode ierr; 1287 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1288 PetscTruth usecprow=a->compressedrow.use; 1289 1290 PetscFunctionBegin; 1291 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1292 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1293 if (zz != yy) { 1294 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1295 } else { 1296 zarray = yarray; 1297 } 1298 1299 idx = a->j; 1300 v = a->a; 1301 if (usecprow){ 1302 if (zz != yy){ 1303 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1304 } 1305 mbs = a->compressedrow.nrows; 1306 ii = a->compressedrow.i; 1307 ridx = a->compressedrow.rindex; 1308 } else { 1309 ii = a->i; 1310 y = yarray; 1311 z = zarray; 1312 } 1313 1314 for (i=0; i<mbs; i++) { 1315 n = ii[1] - ii[0]; ii++; 1316 if (usecprow){ 1317 z = zarray + 5*ridx[i]; 1318 y = yarray + 5*ridx[i]; 1319 } 1320 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1321 for (j=0; j<n; j++) { 1322 xb = x + 5*(*idx++); 1323 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1324 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1325 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1326 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1327 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1328 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1329 v += 25; 1330 } 1331 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1332 if (!usecprow){ 1333 z += 5; y += 5; 1334 } 1335 } 1336 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1337 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1338 if (zz != yy) { 1339 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1340 } 1341 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1342 PetscFunctionReturn(0); 1343 } 1344 #undef __FUNCT__ 1345 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6" 1346 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1347 { 1348 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1349 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6; 1350 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1351 MatScalar *v; 1352 PetscErrorCode ierr; 1353 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1354 PetscTruth usecprow=a->compressedrow.use; 1355 1356 PetscFunctionBegin; 1357 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1358 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1359 if (zz != yy) { 1360 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1361 } else { 1362 zarray = yarray; 1363 } 1364 1365 idx = a->j; 1366 v = a->a; 1367 if (usecprow){ 1368 if (zz != yy){ 1369 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1370 } 1371 mbs = a->compressedrow.nrows; 1372 ii = a->compressedrow.i; 1373 ridx = a->compressedrow.rindex; 1374 } else { 1375 ii = a->i; 1376 y = yarray; 1377 z = zarray; 1378 } 1379 1380 for (i=0; i<mbs; i++) { 1381 n = ii[1] - ii[0]; ii++; 1382 if (usecprow){ 1383 z = zarray + 6*ridx[i]; 1384 y = yarray + 6*ridx[i]; 1385 } 1386 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1387 for (j=0; j<n; j++) { 1388 xb = x + 6*(*idx++); 1389 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1390 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1391 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1392 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1393 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1394 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1395 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1396 v += 36; 1397 } 1398 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1399 if (!usecprow){ 1400 z += 6; y += 6; 1401 } 1402 } 1403 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1404 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1405 if (zz != yy) { 1406 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1407 } 1408 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1409 PetscFunctionReturn(0); 1410 } 1411 1412 #undef __FUNCT__ 1413 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7" 1414 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1415 { 1416 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1417 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1418 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1419 MatScalar *v; 1420 PetscErrorCode ierr; 1421 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1422 PetscTruth usecprow=a->compressedrow.use; 1423 1424 PetscFunctionBegin; 1425 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1426 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1427 if (zz != yy) { 1428 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1429 } else { 1430 zarray = yarray; 1431 } 1432 1433 idx = a->j; 1434 v = a->a; 1435 if (usecprow){ 1436 if (zz != yy){ 1437 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1438 } 1439 mbs = a->compressedrow.nrows; 1440 ii = a->compressedrow.i; 1441 ridx = a->compressedrow.rindex; 1442 } else { 1443 ii = a->i; 1444 y = yarray; 1445 z = zarray; 1446 } 1447 1448 for (i=0; i<mbs; i++) { 1449 n = ii[1] - ii[0]; ii++; 1450 if (usecprow){ 1451 z = zarray + 7*ridx[i]; 1452 y = yarray + 7*ridx[i]; 1453 } 1454 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1455 for (j=0; j<n; j++) { 1456 xb = x + 7*(*idx++); 1457 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1458 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1459 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1460 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1461 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1462 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1463 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1464 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1465 v += 49; 1466 } 1467 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1468 if (!usecprow){ 1469 z += 7; y += 7; 1470 } 1471 } 1472 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1473 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1474 if (zz != yy) { 1475 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1476 } 1477 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1478 PetscFunctionReturn(0); 1479 } 1480 1481 #undef __FUNCT__ 1482 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N" 1483 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1484 { 1485 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1486 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 1487 MatScalar *v; 1488 PetscErrorCode ierr; 1489 PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 1490 PetscInt ncols,k,*ridx=PETSC_NULL; 1491 PetscTruth usecprow=a->compressedrow.use; 1492 1493 PetscFunctionBegin; 1494 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 1495 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1496 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1497 1498 idx = a->j; 1499 v = a->a; 1500 if (usecprow){ 1501 mbs = a->compressedrow.nrows; 1502 ii = a->compressedrow.i; 1503 ridx = a->compressedrow.rindex; 1504 } else { 1505 mbs = a->mbs; 1506 ii = a->i; 1507 z = zarray; 1508 } 1509 1510 if (!a->mult_work) { 1511 k = PetscMax(A->rmap->n,A->cmap->n); 1512 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1513 } 1514 work = a->mult_work; 1515 for (i=0; i<mbs; i++) { 1516 n = ii[1] - ii[0]; ii++; 1517 ncols = n*bs; 1518 workt = work; 1519 for (j=0; j<n; j++) { 1520 xb = x + bs*(*idx++); 1521 for (k=0; k<bs; k++) workt[k] = xb[k]; 1522 workt += bs; 1523 } 1524 if (usecprow) z = zarray + bs*ridx[i]; 1525 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1526 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 1527 v += n*bs2; 1528 if (!usecprow){ 1529 z += bs; 1530 } 1531 } 1532 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1533 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1534 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1535 PetscFunctionReturn(0); 1536 } 1537 1538 #undef __FUNCT__ 1539 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ" 1540 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1541 { 1542 PetscScalar zero = 0.0; 1543 PetscErrorCode ierr; 1544 1545 PetscFunctionBegin; 1546 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1547 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1548 PetscFunctionReturn(0); 1549 } 1550 1551 #undef __FUNCT__ 1552 #define __FUNCT__ "MatMultTranspose_SeqBAIJ" 1553 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1554 { 1555 PetscScalar zero = 0.0; 1556 PetscErrorCode ierr; 1557 1558 PetscFunctionBegin; 1559 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1560 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1561 PetscFunctionReturn(0); 1562 } 1563 1564 #undef __FUNCT__ 1565 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ" 1566 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1567 1568 { 1569 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1570 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1571 MatScalar *v; 1572 PetscErrorCode ierr; 1573 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL; 1574 Mat_CompressedRow cprow = a->compressedrow; 1575 PetscTruth usecprow=cprow.use; 1576 1577 PetscFunctionBegin; 1578 if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); } 1579 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1580 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1581 1582 idx = a->j; 1583 v = a->a; 1584 if (usecprow){ 1585 mbs = cprow.nrows; 1586 ii = cprow.i; 1587 ridx = cprow.rindex; 1588 } else { 1589 mbs=a->mbs; 1590 ii = a->i; 1591 xb = x; 1592 } 1593 1594 switch (bs) { 1595 case 1: 1596 for (i=0; i<mbs; i++) { 1597 if (usecprow) xb = x + ridx[i]; 1598 x1 = xb[0]; 1599 ib = idx + ii[0]; 1600 n = ii[1] - ii[0]; ii++; 1601 for (j=0; j<n; j++) { 1602 rval = ib[j]; 1603 z[rval] += PetscConj(*v) * x1; 1604 v++; 1605 } 1606 if (!usecprow) xb++; 1607 } 1608 break; 1609 case 2: 1610 for (i=0; i<mbs; i++) { 1611 if (usecprow) xb = x + 2*ridx[i]; 1612 x1 = xb[0]; x2 = xb[1]; 1613 ib = idx + ii[0]; 1614 n = ii[1] - ii[0]; ii++; 1615 for (j=0; j<n; j++) { 1616 rval = ib[j]*2; 1617 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 1618 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 1619 v += 4; 1620 } 1621 if (!usecprow) xb += 2; 1622 } 1623 break; 1624 case 3: 1625 for (i=0; i<mbs; i++) { 1626 if (usecprow) xb = x + 3*ridx[i]; 1627 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1628 ib = idx + ii[0]; 1629 n = ii[1] - ii[0]; ii++; 1630 for (j=0; j<n; j++) { 1631 rval = ib[j]*3; 1632 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 1633 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 1634 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 1635 v += 9; 1636 } 1637 if (!usecprow) xb += 3; 1638 } 1639 break; 1640 case 4: 1641 for (i=0; i<mbs; i++) { 1642 if (usecprow) xb = x + 4*ridx[i]; 1643 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1644 ib = idx + ii[0]; 1645 n = ii[1] - ii[0]; ii++; 1646 for (j=0; j<n; j++) { 1647 rval = ib[j]*4; 1648 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 1649 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 1650 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 1651 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 1652 v += 16; 1653 } 1654 if (!usecprow) xb += 4; 1655 } 1656 break; 1657 case 5: 1658 for (i=0; i<mbs; i++) { 1659 if (usecprow) xb = x + 5*ridx[i]; 1660 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1661 x4 = xb[3]; x5 = xb[4]; 1662 ib = idx + ii[0]; 1663 n = ii[1] - ii[0]; ii++; 1664 for (j=0; j<n; j++) { 1665 rval = ib[j]*5; 1666 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 1667 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 1668 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 1669 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 1670 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 1671 v += 25; 1672 } 1673 if (!usecprow) xb += 5; 1674 } 1675 break; 1676 default: { /* block sizes larger than 5 by 5 are handled by BLAS */ 1677 PetscInt ncols,k; 1678 PetscScalar *work,*workt,*xtmp; 1679 1680 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 1681 if (!a->mult_work) { 1682 k = PetscMax(A->rmap->n,A->cmap->n); 1683 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1684 } 1685 work = a->mult_work; 1686 xtmp = x; 1687 for (i=0; i<mbs; i++) { 1688 n = ii[1] - ii[0]; ii++; 1689 ncols = n*bs; 1690 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1691 if (usecprow) { 1692 xtmp = x + bs*ridx[i]; 1693 } 1694 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1695 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1696 v += n*bs2; 1697 if (!usecprow) xtmp += bs; 1698 workt = work; 1699 for (j=0; j<n; j++) { 1700 zb = z + bs*(*idx++); 1701 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1702 workt += bs; 1703 } 1704 } 1705 } 1706 } 1707 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1708 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1709 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1710 PetscFunctionReturn(0); 1711 } 1712 1713 #undef __FUNCT__ 1714 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ" 1715 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1716 { 1717 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1718 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1719 MatScalar *v; 1720 PetscErrorCode ierr; 1721 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL; 1722 Mat_CompressedRow cprow = a->compressedrow; 1723 PetscTruth usecprow=cprow.use; 1724 1725 PetscFunctionBegin; 1726 if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); } 1727 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1728 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1729 1730 idx = a->j; 1731 v = a->a; 1732 if (usecprow){ 1733 mbs = cprow.nrows; 1734 ii = cprow.i; 1735 ridx = cprow.rindex; 1736 } else { 1737 mbs=a->mbs; 1738 ii = a->i; 1739 xb = x; 1740 } 1741 1742 switch (bs) { 1743 case 1: 1744 for (i=0; i<mbs; i++) { 1745 if (usecprow) xb = x + ridx[i]; 1746 x1 = xb[0]; 1747 ib = idx + ii[0]; 1748 n = ii[1] - ii[0]; ii++; 1749 for (j=0; j<n; j++) { 1750 rval = ib[j]; 1751 z[rval] += *v * x1; 1752 v++; 1753 } 1754 if (!usecprow) xb++; 1755 } 1756 break; 1757 case 2: 1758 for (i=0; i<mbs; i++) { 1759 if (usecprow) xb = x + 2*ridx[i]; 1760 x1 = xb[0]; x2 = xb[1]; 1761 ib = idx + ii[0]; 1762 n = ii[1] - ii[0]; ii++; 1763 for (j=0; j<n; j++) { 1764 rval = ib[j]*2; 1765 z[rval++] += v[0]*x1 + v[1]*x2; 1766 z[rval++] += v[2]*x1 + v[3]*x2; 1767 v += 4; 1768 } 1769 if (!usecprow) xb += 2; 1770 } 1771 break; 1772 case 3: 1773 for (i=0; i<mbs; i++) { 1774 if (usecprow) xb = x + 3*ridx[i]; 1775 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1776 ib = idx + ii[0]; 1777 n = ii[1] - ii[0]; ii++; 1778 for (j=0; j<n; j++) { 1779 rval = ib[j]*3; 1780 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 1781 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 1782 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 1783 v += 9; 1784 } 1785 if (!usecprow) xb += 3; 1786 } 1787 break; 1788 case 4: 1789 for (i=0; i<mbs; i++) { 1790 if (usecprow) xb = x + 4*ridx[i]; 1791 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1792 ib = idx + ii[0]; 1793 n = ii[1] - ii[0]; ii++; 1794 for (j=0; j<n; j++) { 1795 rval = ib[j]*4; 1796 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1797 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1798 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1799 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1800 v += 16; 1801 } 1802 if (!usecprow) xb += 4; 1803 } 1804 break; 1805 case 5: 1806 for (i=0; i<mbs; i++) { 1807 if (usecprow) xb = x + 5*ridx[i]; 1808 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1809 x4 = xb[3]; x5 = xb[4]; 1810 ib = idx + ii[0]; 1811 n = ii[1] - ii[0]; ii++; 1812 for (j=0; j<n; j++) { 1813 rval = ib[j]*5; 1814 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1815 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1816 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1817 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1818 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1819 v += 25; 1820 } 1821 if (!usecprow) xb += 5; 1822 } 1823 break; 1824 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 1825 PetscInt ncols,k; 1826 PetscScalar *work,*workt,*xtmp; 1827 1828 if (!a->mult_work) { 1829 k = PetscMax(A->rmap->n,A->cmap->n); 1830 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1831 } 1832 work = a->mult_work; 1833 xtmp = x; 1834 for (i=0; i<mbs; i++) { 1835 n = ii[1] - ii[0]; ii++; 1836 ncols = n*bs; 1837 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1838 if (usecprow) { 1839 xtmp = x + bs*ridx[i]; 1840 } 1841 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1842 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1843 v += n*bs2; 1844 if (!usecprow) xtmp += bs; 1845 workt = work; 1846 for (j=0; j<n; j++) { 1847 zb = z + bs*(*idx++); 1848 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1849 workt += bs; 1850 } 1851 } 1852 } 1853 } 1854 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1855 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1856 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1857 PetscFunctionReturn(0); 1858 } 1859 1860 #undef __FUNCT__ 1861 #define __FUNCT__ "MatScale_SeqBAIJ" 1862 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 1863 { 1864 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 1865 PetscInt totalnz = a->bs2*a->nz; 1866 PetscScalar oalpha = alpha; 1867 PetscErrorCode ierr; 1868 PetscBLASInt one = 1,tnz = PetscBLASIntCast(totalnz); 1869 1870 PetscFunctionBegin; 1871 BLASscal_(&tnz,&oalpha,a->a,&one); 1872 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1873 PetscFunctionReturn(0); 1874 } 1875 1876 #undef __FUNCT__ 1877 #define __FUNCT__ "MatNorm_SeqBAIJ" 1878 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 1879 { 1880 PetscErrorCode ierr; 1881 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1882 MatScalar *v = a->a; 1883 PetscReal sum = 0.0; 1884 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 1885 1886 PetscFunctionBegin; 1887 if (type == NORM_FROBENIUS) { 1888 for (i=0; i< bs2*nz; i++) { 1889 #if defined(PETSC_USE_COMPLEX) 1890 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 1891 #else 1892 sum += (*v)*(*v); v++; 1893 #endif 1894 } 1895 *norm = sqrt(sum); 1896 } else if (type == NORM_1) { /* maximum column sum */ 1897 PetscReal *tmp; 1898 PetscInt *bcol = a->j; 1899 ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr); 1900 ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr); 1901 for (i=0; i<nz; i++){ 1902 for (j=0; j<bs; j++){ 1903 k1 = bs*(*bcol) + j; /* column index */ 1904 for (k=0; k<bs; k++){ 1905 tmp[k1] += PetscAbsScalar(*v); v++; 1906 } 1907 } 1908 bcol++; 1909 } 1910 *norm = 0.0; 1911 for (j=0; j<A->cmap->n; j++) { 1912 if (tmp[j] > *norm) *norm = tmp[j]; 1913 } 1914 ierr = PetscFree(tmp);CHKERRQ(ierr); 1915 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1916 *norm = 0.0; 1917 for (k=0; k<bs; k++) { 1918 for (j=0; j<a->mbs; j++) { 1919 v = a->a + bs2*a->i[j] + k; 1920 sum = 0.0; 1921 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 1922 for (k1=0; k1<bs; k1++){ 1923 sum += PetscAbsScalar(*v); 1924 v += bs; 1925 } 1926 } 1927 if (sum > *norm) *norm = sum; 1928 } 1929 } 1930 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1931 PetscFunctionReturn(0); 1932 } 1933 1934 1935 #undef __FUNCT__ 1936 #define __FUNCT__ "MatEqual_SeqBAIJ" 1937 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg) 1938 { 1939 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data; 1940 PetscErrorCode ierr; 1941 1942 PetscFunctionBegin; 1943 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1944 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1945 *flg = PETSC_FALSE; 1946 PetscFunctionReturn(0); 1947 } 1948 1949 /* if the a->i are the same */ 1950 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1951 if (!*flg) { 1952 PetscFunctionReturn(0); 1953 } 1954 1955 /* if a->j are the same */ 1956 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1957 if (!*flg) { 1958 PetscFunctionReturn(0); 1959 } 1960 /* if a->a are the same */ 1961 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1962 PetscFunctionReturn(0); 1963 1964 } 1965 1966 #undef __FUNCT__ 1967 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ" 1968 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 1969 { 1970 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1971 PetscErrorCode ierr; 1972 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1973 PetscScalar *x,zero = 0.0; 1974 MatScalar *aa,*aa_j; 1975 1976 PetscFunctionBegin; 1977 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1978 bs = A->rmap->bs; 1979 aa = a->a; 1980 ai = a->i; 1981 aj = a->j; 1982 ambs = a->mbs; 1983 bs2 = a->bs2; 1984 1985 ierr = VecSet(v,zero);CHKERRQ(ierr); 1986 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1987 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1988 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1989 for (i=0; i<ambs; i++) { 1990 for (j=ai[i]; j<ai[i+1]; j++) { 1991 if (aj[j] == i) { 1992 row = i*bs; 1993 aa_j = aa+j*bs2; 1994 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1995 break; 1996 } 1997 } 1998 } 1999 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2000 PetscFunctionReturn(0); 2001 } 2002 2003 #undef __FUNCT__ 2004 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ" 2005 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2006 { 2007 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2008 const PetscScalar *l,*r,*li,*ri; 2009 PetscScalar x; 2010 MatScalar *aa, *v; 2011 PetscErrorCode ierr; 2012 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2013 const PetscInt *ai,*aj; 2014 2015 PetscFunctionBegin; 2016 ai = a->i; 2017 aj = a->j; 2018 aa = a->a; 2019 m = A->rmap->n; 2020 n = A->cmap->n; 2021 bs = A->rmap->bs; 2022 mbs = a->mbs; 2023 bs2 = a->bs2; 2024 if (ll) { 2025 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2026 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2027 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2028 for (i=0; i<mbs; i++) { /* for each block row */ 2029 M = ai[i+1] - ai[i]; 2030 li = l + i*bs; 2031 v = aa + bs2*ai[i]; 2032 for (j=0; j<M; j++) { /* for each block */ 2033 for (k=0; k<bs2; k++) { 2034 (*v++) *= li[k%bs]; 2035 } 2036 } 2037 } 2038 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2039 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2040 } 2041 2042 if (rr) { 2043 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2044 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2045 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2046 for (i=0; i<mbs; i++) { /* for each block row */ 2047 iai = ai[i]; 2048 M = ai[i+1] - iai; 2049 v = aa + bs2*iai; 2050 for (j=0; j<M; j++) { /* for each block */ 2051 ri = r + bs*aj[iai+j]; 2052 for (k=0; k<bs; k++) { 2053 x = ri[k]; 2054 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2055 v += bs; 2056 } 2057 } 2058 } 2059 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2060 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2061 } 2062 PetscFunctionReturn(0); 2063 } 2064 2065 2066 #undef __FUNCT__ 2067 #define __FUNCT__ "MatGetInfo_SeqBAIJ" 2068 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2069 { 2070 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2071 2072 PetscFunctionBegin; 2073 info->block_size = a->bs2; 2074 info->nz_allocated = a->bs2*a->maxnz; 2075 info->nz_used = a->bs2*a->nz; 2076 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2077 info->assemblies = A->num_ass; 2078 info->mallocs = A->info.mallocs; 2079 info->memory = ((PetscObject)A)->mem; 2080 if (A->factortype) { 2081 info->fill_ratio_given = A->info.fill_ratio_given; 2082 info->fill_ratio_needed = A->info.fill_ratio_needed; 2083 info->factor_mallocs = A->info.factor_mallocs; 2084 } else { 2085 info->fill_ratio_given = 0; 2086 info->fill_ratio_needed = 0; 2087 info->factor_mallocs = 0; 2088 } 2089 PetscFunctionReturn(0); 2090 } 2091 2092 2093 #undef __FUNCT__ 2094 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2095 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2096 { 2097 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2098 PetscErrorCode ierr; 2099 2100 PetscFunctionBegin; 2101 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2102 PetscFunctionReturn(0); 2103 } 2104