1 #define PETSCMAT_DLL 2 3 #include "../src/mat/impls/baij/seq/baij.h" 4 #include "../src/mat/blockinvert.h" 5 #include "petscbt.h" 6 #include "petscblaslapack.h" 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ" 10 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 11 { 12 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 15 const PetscInt *idx; 16 PetscInt start,end,*ai,*aj,bs,*nidx2; 17 PetscBT table; 18 19 PetscFunctionBegin; 20 m = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 25 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 26 27 ierr = PetscBTCreate(m,table);CHKERRQ(ierr); 28 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 29 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 30 31 for (i=0; i<is_max; i++) { 32 /* Initialise the two local arrays */ 33 isz = 0; 34 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 35 36 /* Extract the indices, assume there can be duplicate entries */ 37 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 38 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 39 40 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 41 for (j=0; j<n ; ++j){ 42 ival = idx[j]/bs; /* convert the indices into block indices */ 43 if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 44 if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;} 45 } 46 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 47 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 48 49 k = 0; 50 for (j=0; j<ov; j++){ /* for each overlap*/ 51 n = isz; 52 for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */ 53 row = nidx[k]; 54 start = ai[row]; 55 end = ai[row+1]; 56 for (l = start; l<end ; l++){ 57 val = aj[l]; 58 if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;} 59 } 60 } 61 } 62 /* expand the Index Set */ 63 for (j=0; j<isz; j++) { 64 for (k=0; k<bs; k++) 65 nidx2[j*bs+k] = nidx[j]*bs+k; 66 } 67 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 68 } 69 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 70 ierr = PetscFree(nidx);CHKERRQ(ierr); 71 ierr = PetscFree(nidx2);CHKERRQ(ierr); 72 PetscFunctionReturn(0); 73 } 74 75 #undef __FUNCT__ 76 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private" 77 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 78 { 79 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 80 PetscErrorCode ierr; 81 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 82 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 83 const PetscInt *irow,*icol; 84 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 85 PetscInt *aj = a->j,*ai = a->i; 86 MatScalar *mat_a; 87 Mat C; 88 PetscTruth flag,sorted; 89 90 PetscFunctionBegin; 91 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 92 if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 93 94 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 95 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 96 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 97 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 98 99 ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 100 ssmap = smap; 101 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 102 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 103 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 104 /* determine lens of each row */ 105 for (i=0; i<nrows; i++) { 106 kstart = ai[irow[i]]; 107 kend = kstart + a->ilen[irow[i]]; 108 lens[i] = 0; 109 for (k=kstart; k<kend; k++) { 110 if (ssmap[aj[k]]) { 111 lens[i]++; 112 } 113 } 114 } 115 /* Create and fill new matrix */ 116 if (scall == MAT_REUSE_MATRIX) { 117 c = (Mat_SeqBAIJ *)((*B)->data); 118 119 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 120 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 121 if (!flag) { 122 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 123 } 124 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 125 C = *B; 126 } else { 127 ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 128 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 129 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 130 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr); 131 } 132 c = (Mat_SeqBAIJ *)(C->data); 133 for (i=0; i<nrows; i++) { 134 row = irow[i]; 135 kstart = ai[row]; 136 kend = kstart + a->ilen[row]; 137 mat_i = c->i[i]; 138 mat_j = c->j + mat_i; 139 mat_a = c->a + mat_i*bs2; 140 mat_ilen = c->ilen + i; 141 for (k=kstart; k<kend; k++) { 142 if ((tcol=ssmap[a->j[k]])) { 143 *mat_j++ = tcol - 1; 144 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 145 mat_a += bs2; 146 (*mat_ilen)++; 147 } 148 } 149 } 150 151 /* Free work space */ 152 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 153 ierr = PetscFree(smap);CHKERRQ(ierr); 154 ierr = PetscFree(lens);CHKERRQ(ierr); 155 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 156 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 157 158 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 159 *B = C; 160 PetscFunctionReturn(0); 161 } 162 163 #undef __FUNCT__ 164 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ" 165 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 166 { 167 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 168 IS is1,is2; 169 PetscErrorCode ierr; 170 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count; 171 const PetscInt *irow,*icol; 172 173 PetscFunctionBegin; 174 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 175 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 176 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 177 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 178 179 /* Verify if the indices corespond to each element in a block 180 and form the IS with compressed IS */ 181 ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr); 182 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 183 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 184 count = 0; 185 for (i=0; i<a->mbs; i++) { 186 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 187 if (vary[i]==bs) iary[count++] = i; 188 } 189 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 190 191 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 192 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 193 count = 0; 194 for (i=0; i<a->mbs; i++) { 195 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_PLIB,"Internal error in PETSc"); 196 if (vary[i]==bs) iary[count++] = i; 197 } 198 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);CHKERRQ(ierr); 199 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 200 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 201 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 202 203 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 204 ISDestroy(is1); 205 ISDestroy(is2); 206 PetscFunctionReturn(0); 207 } 208 209 #undef __FUNCT__ 210 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ" 211 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 212 { 213 PetscErrorCode ierr; 214 PetscInt i; 215 216 PetscFunctionBegin; 217 if (scall == MAT_INITIAL_MATRIX) { 218 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 219 } 220 221 for (i=0; i<n; i++) { 222 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 223 } 224 PetscFunctionReturn(0); 225 } 226 227 228 /* -------------------------------------------------------*/ 229 /* Should check that shapes of vectors and matrices match */ 230 /* -------------------------------------------------------*/ 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatMult_SeqBAIJ_1" 234 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 235 { 236 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 237 PetscScalar *z,sum; 238 const PetscScalar *x; 239 const MatScalar *v; 240 PetscErrorCode ierr; 241 PetscInt mbs,i,n,nonzerorow=0; 242 const PetscInt *idx,*ii,*ridx=PETSC_NULL; 243 PetscTruth usecprow=a->compressedrow.use; 244 245 PetscFunctionBegin; 246 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 247 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 248 249 if (usecprow){ 250 mbs = a->compressedrow.nrows; 251 ii = a->compressedrow.i; 252 ridx = a->compressedrow.rindex; 253 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 254 } else { 255 mbs = a->mbs; 256 ii = a->i; 257 } 258 259 for (i=0; i<mbs; i++) { 260 n = ii[1] - ii[0]; 261 v = a->a + ii[0]; 262 idx = a->j + ii[0]; 263 ii++; 264 sum = 0.0; 265 PetscSparseDensePlusDot(sum,x,v,idx,n); 266 if (usecprow){ 267 z[ridx[i]] = sum; 268 } else { 269 nonzerorow += (n>0); 270 z[i] = sum; 271 } 272 } 273 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 274 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 275 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 276 PetscFunctionReturn(0); 277 } 278 279 #undef __FUNCT__ 280 #define __FUNCT__ "MatMult_SeqBAIJ_2" 281 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 282 { 283 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 284 PetscScalar *z = 0,sum1,sum2,*zarray; 285 const PetscScalar *x,*xb; 286 PetscScalar x1,x2; 287 const MatScalar *v; 288 PetscErrorCode ierr; 289 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 290 PetscTruth usecprow=a->compressedrow.use; 291 292 PetscFunctionBegin; 293 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 294 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 295 296 idx = a->j; 297 v = a->a; 298 if (usecprow){ 299 mbs = a->compressedrow.nrows; 300 ii = a->compressedrow.i; 301 ridx = a->compressedrow.rindex; 302 } else { 303 mbs = a->mbs; 304 ii = a->i; 305 z = zarray; 306 } 307 308 for (i=0; i<mbs; i++) { 309 n = ii[1] - ii[0]; ii++; 310 sum1 = 0.0; sum2 = 0.0; 311 nonzerorow += (n>0); 312 for (j=0; j<n; j++) { 313 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 314 sum1 += v[0]*x1 + v[2]*x2; 315 sum2 += v[1]*x1 + v[3]*x2; 316 v += 4; 317 } 318 if (usecprow) z = zarray + 2*ridx[i]; 319 z[0] = sum1; z[1] = sum2; 320 if (!usecprow) z += 2; 321 } 322 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 323 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 324 ierr = PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);CHKERRQ(ierr); 325 PetscFunctionReturn(0); 326 } 327 328 #undef __FUNCT__ 329 #define __FUNCT__ "MatMult_SeqBAIJ_3" 330 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 331 { 332 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 333 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 334 const PetscScalar *x,*xb; 335 const MatScalar *v; 336 PetscErrorCode ierr; 337 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 338 PetscTruth usecprow=a->compressedrow.use; 339 340 341 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 342 #pragma disjoint(*v,*z,*xb) 343 #endif 344 345 PetscFunctionBegin; 346 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 347 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 348 349 idx = a->j; 350 v = a->a; 351 if (usecprow){ 352 mbs = a->compressedrow.nrows; 353 ii = a->compressedrow.i; 354 ridx = a->compressedrow.rindex; 355 } else { 356 mbs = a->mbs; 357 ii = a->i; 358 z = zarray; 359 } 360 361 for (i=0; i<mbs; i++) { 362 n = ii[1] - ii[0]; ii++; 363 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 364 nonzerorow += (n>0); 365 for (j=0; j<n; j++) { 366 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 367 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 368 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 369 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 370 v += 9; 371 } 372 if (usecprow) z = zarray + 3*ridx[i]; 373 z[0] = sum1; z[1] = sum2; z[2] = sum3; 374 if (!usecprow) z += 3; 375 } 376 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 377 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 378 ierr = PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);CHKERRQ(ierr); 379 PetscFunctionReturn(0); 380 } 381 382 #undef __FUNCT__ 383 #define __FUNCT__ "MatMult_SeqBAIJ_4" 384 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 385 { 386 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 387 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 388 const PetscScalar *x,*xb; 389 const MatScalar *v; 390 PetscErrorCode ierr; 391 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 392 PetscTruth usecprow=a->compressedrow.use; 393 394 PetscFunctionBegin; 395 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 396 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 397 398 idx = a->j; 399 v = a->a; 400 if (usecprow){ 401 mbs = a->compressedrow.nrows; 402 ii = a->compressedrow.i; 403 ridx = a->compressedrow.rindex; 404 } else { 405 mbs = a->mbs; 406 ii = a->i; 407 z = zarray; 408 } 409 410 for (i=0; i<mbs; i++) { 411 n = ii[1] - ii[0]; ii++; 412 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 413 nonzerorow += (n>0); 414 for (j=0; j<n; j++) { 415 xb = x + 4*(*idx++); 416 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 417 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 418 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 419 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 420 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 421 v += 16; 422 } 423 if (usecprow) z = zarray + 4*ridx[i]; 424 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 425 if (!usecprow) z += 4; 426 } 427 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 428 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 429 ierr = PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);CHKERRQ(ierr); 430 PetscFunctionReturn(0); 431 } 432 433 #undef __FUNCT__ 434 #define __FUNCT__ "MatMult_SeqBAIJ_5" 435 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 436 { 437 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 438 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 439 const PetscScalar *xb,*x; 440 const MatScalar *v; 441 PetscErrorCode ierr; 442 const PetscInt *idx,*ii,*ridx=PETSC_NULL; 443 PetscInt mbs,i,j,n,nonzerorow=0; 444 PetscTruth usecprow=a->compressedrow.use; 445 446 PetscFunctionBegin; 447 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 448 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 449 450 idx = a->j; 451 v = a->a; 452 if (usecprow){ 453 mbs = a->compressedrow.nrows; 454 ii = a->compressedrow.i; 455 ridx = a->compressedrow.rindex; 456 } else { 457 mbs = a->mbs; 458 ii = a->i; 459 z = zarray; 460 } 461 462 for (i=0; i<mbs; i++) { 463 n = ii[1] - ii[0]; ii++; 464 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 465 nonzerorow += (n>0); 466 for (j=0; j<n; j++) { 467 xb = x + 5*(*idx++); 468 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 469 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 470 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 471 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 472 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 473 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 474 v += 25; 475 } 476 if (usecprow) z = zarray + 5*ridx[i]; 477 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 478 if (!usecprow) z += 5; 479 } 480 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 481 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 482 ierr = PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);CHKERRQ(ierr); 483 PetscFunctionReturn(0); 484 } 485 486 487 #undef __FUNCT__ 488 #define __FUNCT__ "MatMult_SeqBAIJ_6" 489 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 490 { 491 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 492 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 493 const PetscScalar *x,*xb; 494 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 495 const MatScalar *v; 496 PetscErrorCode ierr; 497 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 498 PetscTruth usecprow=a->compressedrow.use; 499 500 PetscFunctionBegin; 501 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 502 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 503 504 idx = a->j; 505 v = a->a; 506 if (usecprow){ 507 mbs = a->compressedrow.nrows; 508 ii = a->compressedrow.i; 509 ridx = a->compressedrow.rindex; 510 } else { 511 mbs = a->mbs; 512 ii = a->i; 513 z = zarray; 514 } 515 516 for (i=0; i<mbs; i++) { 517 n = ii[1] - ii[0]; ii++; 518 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; 519 nonzerorow += (n>0); 520 for (j=0; j<n; j++) { 521 xb = x + 6*(*idx++); 522 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 523 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 524 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 525 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 526 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 527 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 528 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 529 v += 36; 530 } 531 if (usecprow) z = zarray + 6*ridx[i]; 532 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 533 if (!usecprow) z += 6; 534 } 535 536 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 537 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 538 ierr = PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);CHKERRQ(ierr); 539 PetscFunctionReturn(0); 540 } 541 542 #undef __FUNCT__ 543 #define __FUNCT__ "MatMult_SeqBAIJ_7" 544 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 545 { 546 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 547 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 548 const PetscScalar *x,*xb; 549 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 550 const MatScalar *v; 551 PetscErrorCode ierr; 552 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 553 PetscTruth usecprow=a->compressedrow.use; 554 555 PetscFunctionBegin; 556 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 557 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 558 559 idx = a->j; 560 v = a->a; 561 if (usecprow){ 562 mbs = a->compressedrow.nrows; 563 ii = a->compressedrow.i; 564 ridx = a->compressedrow.rindex; 565 } else { 566 mbs = a->mbs; 567 ii = a->i; 568 z = zarray; 569 } 570 571 for (i=0; i<mbs; i++) { 572 n = ii[1] - ii[0]; ii++; 573 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 574 nonzerorow += (n>0); 575 for (j=0; j<n; j++) { 576 xb = x + 7*(*idx++); 577 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 578 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 579 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 580 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 581 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 582 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 583 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 584 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 585 v += 49; 586 } 587 if (usecprow) z = zarray + 7*ridx[i]; 588 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 589 if (!usecprow) z += 7; 590 } 591 592 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 593 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 594 ierr = PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);CHKERRQ(ierr); 595 PetscFunctionReturn(0); 596 } 597 598 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 599 /* Default MatMult for block size 15 */ 600 601 #undef __FUNCT__ 602 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1" 603 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 604 { 605 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 606 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 607 const PetscScalar *x,*xb; 608 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 609 const MatScalar *v; 610 PetscErrorCode ierr; 611 const PetscInt *ii,*ij=a->j,*idx; 612 PetscInt mbs,i,j,k,n,*ridx=PETSC_NULL,nonzerorow=0; 613 PetscTruth usecprow=a->compressedrow.use; 614 615 PetscFunctionBegin; 616 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 617 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 618 619 v = a->a; 620 if (usecprow){ 621 mbs = a->compressedrow.nrows; 622 ii = a->compressedrow.i; 623 ridx = a->compressedrow.rindex; 624 } else { 625 mbs = a->mbs; 626 ii = a->i; 627 z = zarray; 628 } 629 630 for (i=0; i<mbs; i++) { 631 n = ii[i+1] - ii[i]; 632 idx = ij + ii[i]; 633 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 634 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 635 636 nonzerorow += (n>0); 637 for (j=0; j<n; j++) { 638 xb = x + 15*(idx[j]); 639 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 640 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 641 642 for(k=0;k<15;k++){ 643 sum1 += v[0]*xb[k]; 644 sum2 += v[1]*xb[k]; 645 sum3 += v[2]*xb[k]; 646 sum4 += v[3]*xb[k]; 647 sum5 += v[4]*xb[k]; 648 sum6 += v[5]*xb[k]; 649 sum7 += v[6]*xb[k]; 650 sum8 += v[7]*xb[k]; 651 sum9 += v[8]*xb[k]; 652 sum10 += v[9]*xb[k]; 653 sum11 += v[10]*xb[k]; 654 sum12 += v[11]*xb[k]; 655 sum13 += v[12]*xb[k]; 656 sum14 += v[13]*xb[k]; 657 sum15 += v[14]*xb[k]; 658 v += 15; 659 } 660 } 661 if (usecprow) z = zarray + 15*ridx[i]; 662 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 663 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 664 665 if (!usecprow) z += 15; 666 } 667 668 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 669 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 670 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 671 PetscFunctionReturn(0); 672 } 673 674 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 675 #undef __FUNCT__ 676 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2" 677 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 678 { 679 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 680 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 681 const PetscScalar *x,*xb; 682 PetscScalar x1,x2,x3,x4,*zarray; 683 const MatScalar *v; 684 PetscErrorCode ierr; 685 const PetscInt *ii,*ij=a->j,*idx; 686 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 687 PetscTruth usecprow=a->compressedrow.use; 688 689 PetscFunctionBegin; 690 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 691 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 692 693 v = a->a; 694 if (usecprow){ 695 mbs = a->compressedrow.nrows; 696 ii = a->compressedrow.i; 697 ridx = a->compressedrow.rindex; 698 } else { 699 mbs = a->mbs; 700 ii = a->i; 701 z = zarray; 702 } 703 704 for (i=0; i<mbs; i++) { 705 n = ii[i+1] - ii[i]; 706 idx = ij + ii[i]; 707 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 708 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 709 710 nonzerorow += (n>0); 711 for (j=0; j<n; j++) { 712 xb = x + 15*(idx[j]); 713 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 714 715 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 716 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 717 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 718 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 719 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 720 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 721 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 722 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 723 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 724 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 725 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 726 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 727 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 728 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 729 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 730 731 v += 60; 732 733 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 734 735 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 736 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 737 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 738 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 739 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 740 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 741 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 742 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 743 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 744 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 745 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 746 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 747 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 748 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 749 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 750 v += 60; 751 752 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 753 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 754 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 755 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 756 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 757 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 758 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 759 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 760 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 761 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 762 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 763 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 764 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 765 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 766 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 767 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 768 v += 60; 769 770 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 771 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 772 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 773 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 774 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 775 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 776 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 777 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 778 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 779 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 780 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 781 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 782 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 783 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 784 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 785 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 786 v += 45; 787 } 788 if (usecprow) z = zarray + 15*ridx[i]; 789 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 790 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 791 792 if (!usecprow) z += 15; 793 } 794 795 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 796 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 797 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 798 PetscFunctionReturn(0); 799 } 800 801 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 802 #undef __FUNCT__ 803 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3" 804 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 805 { 806 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 807 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 808 const PetscScalar *x,*xb; 809 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 810 const MatScalar *v; 811 PetscErrorCode ierr; 812 const PetscInt *ii,*ij=a->j,*idx; 813 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 814 PetscTruth usecprow=a->compressedrow.use; 815 816 PetscFunctionBegin; 817 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 818 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 819 820 v = a->a; 821 if (usecprow){ 822 mbs = a->compressedrow.nrows; 823 ii = a->compressedrow.i; 824 ridx = a->compressedrow.rindex; 825 } else { 826 mbs = a->mbs; 827 ii = a->i; 828 z = zarray; 829 } 830 831 for (i=0; i<mbs; i++) { 832 n = ii[i+1] - ii[i]; 833 idx = ij + ii[i]; 834 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 835 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 836 837 nonzerorow += (n>0); 838 for (j=0; j<n; j++) { 839 xb = x + 15*(idx[j]); 840 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 841 x8 = xb[7]; 842 843 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 844 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 845 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 846 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 847 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 848 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 849 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 850 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 851 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 852 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 853 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 854 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 855 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 856 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 857 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 858 v += 120; 859 860 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 861 862 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 863 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 864 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 865 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 866 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 867 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 868 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 869 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 870 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 871 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 872 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 873 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 874 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 875 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 876 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 877 v += 105; 878 } 879 if (usecprow) z = zarray + 15*ridx[i]; 880 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 881 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 882 883 if (!usecprow) z += 15; 884 } 885 886 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 887 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 888 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 889 PetscFunctionReturn(0); 890 } 891 892 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 893 894 #undef __FUNCT__ 895 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4" 896 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 897 { 898 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 899 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 900 const PetscScalar *x,*xb; 901 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 902 const MatScalar *v; 903 PetscErrorCode ierr; 904 const PetscInt *ii,*ij=a->j,*idx; 905 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 906 PetscTruth usecprow=a->compressedrow.use; 907 908 PetscFunctionBegin; 909 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 910 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 911 912 v = a->a; 913 if (usecprow){ 914 mbs = a->compressedrow.nrows; 915 ii = a->compressedrow.i; 916 ridx = a->compressedrow.rindex; 917 } else { 918 mbs = a->mbs; 919 ii = a->i; 920 z = zarray; 921 } 922 923 for (i=0; i<mbs; i++) { 924 n = ii[i+1] - ii[i]; 925 idx = ij + ii[i]; 926 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 927 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 928 929 nonzerorow += (n>0); 930 for (j=0; j<n; j++) { 931 xb = x + 15*(idx[j]); 932 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 933 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 934 935 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 936 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 937 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 938 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 939 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 940 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 941 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 942 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 943 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 944 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 945 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 946 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 947 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 948 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 949 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 950 v += 225; 951 } 952 if (usecprow) z = zarray + 15*ridx[i]; 953 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 954 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 955 956 if (!usecprow) z += 15; 957 } 958 959 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 960 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 961 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 962 PetscFunctionReturn(0); 963 } 964 965 966 /* 967 This will not work with MatScalar == float because it calls the BLAS 968 */ 969 #undef __FUNCT__ 970 #define __FUNCT__ "MatMult_SeqBAIJ_N" 971 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 972 { 973 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 974 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 975 MatScalar *v; 976 PetscErrorCode ierr; 977 PetscInt mbs=a->mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 978 PetscInt ncols,k,*ridx=PETSC_NULL,nonzerorow=0; 979 PetscTruth usecprow=a->compressedrow.use; 980 981 PetscFunctionBegin; 982 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 983 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 984 985 idx = a->j; 986 v = a->a; 987 if (usecprow){ 988 mbs = a->compressedrow.nrows; 989 ii = a->compressedrow.i; 990 ridx = a->compressedrow.rindex; 991 } else { 992 mbs = a->mbs; 993 ii = a->i; 994 z = zarray; 995 } 996 997 if (!a->mult_work) { 998 k = PetscMax(A->rmap->n,A->cmap->n); 999 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1000 } 1001 work = a->mult_work; 1002 for (i=0; i<mbs; i++) { 1003 n = ii[1] - ii[0]; ii++; 1004 ncols = n*bs; 1005 workt = work; 1006 nonzerorow += (n>0); 1007 for (j=0; j<n; j++) { 1008 xb = x + bs*(*idx++); 1009 for (k=0; k<bs; k++) workt[k] = xb[k]; 1010 workt += bs; 1011 } 1012 if (usecprow) z = zarray + bs*ridx[i]; 1013 Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1014 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1015 v += n*bs2; 1016 if (!usecprow) z += bs; 1017 } 1018 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1019 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1020 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);CHKERRQ(ierr); 1021 PetscFunctionReturn(0); 1022 } 1023 1024 extern PetscErrorCode VecCopy_Seq(Vec,Vec); 1025 #undef __FUNCT__ 1026 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1" 1027 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1028 { 1029 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1030 const PetscScalar *x; 1031 PetscScalar *y,*z,sum; 1032 const MatScalar *v; 1033 PetscErrorCode ierr; 1034 PetscInt mbs=a->mbs,i,n,*ridx=PETSC_NULL,nonzerorow=0; 1035 const PetscInt *idx,*ii; 1036 PetscTruth usecprow=a->compressedrow.use; 1037 1038 PetscFunctionBegin; 1039 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 1040 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1041 if (zz != yy) { 1042 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1043 } else { 1044 z = y; 1045 } 1046 1047 idx = a->j; 1048 v = a->a; 1049 if (usecprow){ 1050 if (zz != yy){ 1051 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1052 } 1053 mbs = a->compressedrow.nrows; 1054 ii = a->compressedrow.i; 1055 ridx = a->compressedrow.rindex; 1056 } else { 1057 ii = a->i; 1058 } 1059 1060 for (i=0; i<mbs; i++) { 1061 n = ii[1] - ii[0]; 1062 ii++; 1063 if (!usecprow){ 1064 nonzerorow += (n>0); 1065 sum = y[i]; 1066 } else { 1067 sum = y[ridx[i]]; 1068 } 1069 PetscSparseDensePlusDot(sum,x,v,idx,n); 1070 v += n; 1071 idx += n; 1072 if (usecprow){ 1073 z[ridx[i]] = sum; 1074 } else { 1075 z[i] = sum; 1076 } 1077 } 1078 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 1079 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1080 if (zz != yy) { 1081 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1082 } 1083 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 1084 PetscFunctionReturn(0); 1085 } 1086 1087 #undef __FUNCT__ 1088 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2" 1089 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1090 { 1091 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1092 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2; 1093 PetscScalar x1,x2,*yarray,*zarray; 1094 MatScalar *v; 1095 PetscErrorCode ierr; 1096 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1097 PetscTruth usecprow=a->compressedrow.use; 1098 1099 PetscFunctionBegin; 1100 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1101 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1102 if (zz != yy) { 1103 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1104 } else { 1105 zarray = yarray; 1106 } 1107 1108 idx = a->j; 1109 v = a->a; 1110 if (usecprow){ 1111 if (zz != yy){ 1112 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1113 } 1114 mbs = a->compressedrow.nrows; 1115 ii = a->compressedrow.i; 1116 ridx = a->compressedrow.rindex; 1117 if (zz != yy){ 1118 ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1119 } 1120 } else { 1121 ii = a->i; 1122 y = yarray; 1123 z = zarray; 1124 } 1125 1126 for (i=0; i<mbs; i++) { 1127 n = ii[1] - ii[0]; ii++; 1128 if (usecprow){ 1129 z = zarray + 2*ridx[i]; 1130 y = yarray + 2*ridx[i]; 1131 } 1132 sum1 = y[0]; sum2 = y[1]; 1133 for (j=0; j<n; j++) { 1134 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 1135 sum1 += v[0]*x1 + v[2]*x2; 1136 sum2 += v[1]*x1 + v[3]*x2; 1137 v += 4; 1138 } 1139 z[0] = sum1; z[1] = sum2; 1140 if (!usecprow){ 1141 z += 2; y += 2; 1142 } 1143 } 1144 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1145 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1146 if (zz != yy) { 1147 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1148 } 1149 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1150 PetscFunctionReturn(0); 1151 } 1152 1153 #undef __FUNCT__ 1154 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3" 1155 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1156 { 1157 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1158 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1159 MatScalar *v; 1160 PetscErrorCode ierr; 1161 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1162 PetscTruth usecprow=a->compressedrow.use; 1163 1164 PetscFunctionBegin; 1165 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1166 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1167 if (zz != yy) { 1168 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1169 } else { 1170 zarray = yarray; 1171 } 1172 1173 idx = a->j; 1174 v = a->a; 1175 if (usecprow){ 1176 if (zz != yy){ 1177 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1178 } 1179 mbs = a->compressedrow.nrows; 1180 ii = a->compressedrow.i; 1181 ridx = a->compressedrow.rindex; 1182 } else { 1183 ii = a->i; 1184 y = yarray; 1185 z = zarray; 1186 } 1187 1188 for (i=0; i<mbs; i++) { 1189 n = ii[1] - ii[0]; ii++; 1190 if (usecprow){ 1191 z = zarray + 3*ridx[i]; 1192 y = yarray + 3*ridx[i]; 1193 } 1194 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1195 for (j=0; j<n; j++) { 1196 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1197 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1198 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1199 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1200 v += 9; 1201 } 1202 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1203 if (!usecprow){ 1204 z += 3; y += 3; 1205 } 1206 } 1207 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1208 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1209 if (zz != yy) { 1210 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1211 } 1212 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1213 PetscFunctionReturn(0); 1214 } 1215 1216 #undef __FUNCT__ 1217 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4" 1218 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1219 { 1220 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1221 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1222 MatScalar *v; 1223 PetscErrorCode ierr; 1224 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1225 PetscTruth usecprow=a->compressedrow.use; 1226 1227 PetscFunctionBegin; 1228 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1229 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1230 if (zz != yy) { 1231 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1232 } else { 1233 zarray = yarray; 1234 } 1235 1236 idx = a->j; 1237 v = a->a; 1238 if (usecprow){ 1239 if (zz != yy){ 1240 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1241 } 1242 mbs = a->compressedrow.nrows; 1243 ii = a->compressedrow.i; 1244 ridx = a->compressedrow.rindex; 1245 } else { 1246 ii = a->i; 1247 y = yarray; 1248 z = zarray; 1249 } 1250 1251 for (i=0; i<mbs; i++) { 1252 n = ii[1] - ii[0]; ii++; 1253 if (usecprow){ 1254 z = zarray + 4*ridx[i]; 1255 y = yarray + 4*ridx[i]; 1256 } 1257 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1258 for (j=0; j<n; j++) { 1259 xb = x + 4*(*idx++); 1260 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1261 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1262 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1263 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1264 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1265 v += 16; 1266 } 1267 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1268 if (!usecprow){ 1269 z += 4; y += 4; 1270 } 1271 } 1272 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1273 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1274 if (zz != yy) { 1275 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1276 } 1277 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1278 PetscFunctionReturn(0); 1279 } 1280 1281 #undef __FUNCT__ 1282 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5" 1283 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1284 { 1285 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1286 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1287 PetscScalar *yarray,*zarray; 1288 MatScalar *v; 1289 PetscErrorCode ierr; 1290 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1291 PetscTruth usecprow=a->compressedrow.use; 1292 1293 PetscFunctionBegin; 1294 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1295 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1296 if (zz != yy) { 1297 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1298 } else { 1299 zarray = yarray; 1300 } 1301 1302 idx = a->j; 1303 v = a->a; 1304 if (usecprow){ 1305 if (zz != yy){ 1306 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1307 } 1308 mbs = a->compressedrow.nrows; 1309 ii = a->compressedrow.i; 1310 ridx = a->compressedrow.rindex; 1311 } else { 1312 ii = a->i; 1313 y = yarray; 1314 z = zarray; 1315 } 1316 1317 for (i=0; i<mbs; i++) { 1318 n = ii[1] - ii[0]; ii++; 1319 if (usecprow){ 1320 z = zarray + 5*ridx[i]; 1321 y = yarray + 5*ridx[i]; 1322 } 1323 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1324 for (j=0; j<n; j++) { 1325 xb = x + 5*(*idx++); 1326 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1327 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1328 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1329 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1330 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1331 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1332 v += 25; 1333 } 1334 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1335 if (!usecprow){ 1336 z += 5; y += 5; 1337 } 1338 } 1339 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1340 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1341 if (zz != yy) { 1342 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1343 } 1344 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1345 PetscFunctionReturn(0); 1346 } 1347 #undef __FUNCT__ 1348 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6" 1349 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1350 { 1351 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1352 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6; 1353 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1354 MatScalar *v; 1355 PetscErrorCode ierr; 1356 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1357 PetscTruth usecprow=a->compressedrow.use; 1358 1359 PetscFunctionBegin; 1360 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1361 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1362 if (zz != yy) { 1363 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1364 } else { 1365 zarray = yarray; 1366 } 1367 1368 idx = a->j; 1369 v = a->a; 1370 if (usecprow){ 1371 if (zz != yy){ 1372 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1373 } 1374 mbs = a->compressedrow.nrows; 1375 ii = a->compressedrow.i; 1376 ridx = a->compressedrow.rindex; 1377 } else { 1378 ii = a->i; 1379 y = yarray; 1380 z = zarray; 1381 } 1382 1383 for (i=0; i<mbs; i++) { 1384 n = ii[1] - ii[0]; ii++; 1385 if (usecprow){ 1386 z = zarray + 6*ridx[i]; 1387 y = yarray + 6*ridx[i]; 1388 } 1389 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1390 for (j=0; j<n; j++) { 1391 xb = x + 6*(*idx++); 1392 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1393 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1394 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1395 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1396 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1397 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1398 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1399 v += 36; 1400 } 1401 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1402 if (!usecprow){ 1403 z += 6; y += 6; 1404 } 1405 } 1406 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1407 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1408 if (zz != yy) { 1409 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1410 } 1411 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1412 PetscFunctionReturn(0); 1413 } 1414 1415 #undef __FUNCT__ 1416 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7" 1417 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1418 { 1419 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1420 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1421 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1422 MatScalar *v; 1423 PetscErrorCode ierr; 1424 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1425 PetscTruth usecprow=a->compressedrow.use; 1426 1427 PetscFunctionBegin; 1428 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1429 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1430 if (zz != yy) { 1431 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1432 } else { 1433 zarray = yarray; 1434 } 1435 1436 idx = a->j; 1437 v = a->a; 1438 if (usecprow){ 1439 if (zz != yy){ 1440 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1441 } 1442 mbs = a->compressedrow.nrows; 1443 ii = a->compressedrow.i; 1444 ridx = a->compressedrow.rindex; 1445 } else { 1446 ii = a->i; 1447 y = yarray; 1448 z = zarray; 1449 } 1450 1451 for (i=0; i<mbs; i++) { 1452 n = ii[1] - ii[0]; ii++; 1453 if (usecprow){ 1454 z = zarray + 7*ridx[i]; 1455 y = yarray + 7*ridx[i]; 1456 } 1457 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1458 for (j=0; j<n; j++) { 1459 xb = x + 7*(*idx++); 1460 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1461 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1462 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1463 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1464 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1465 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1466 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1467 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1468 v += 49; 1469 } 1470 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1471 if (!usecprow){ 1472 z += 7; y += 7; 1473 } 1474 } 1475 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1476 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1477 if (zz != yy) { 1478 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1479 } 1480 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1481 PetscFunctionReturn(0); 1482 } 1483 1484 #undef __FUNCT__ 1485 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N" 1486 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1487 { 1488 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1489 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 1490 MatScalar *v; 1491 PetscErrorCode ierr; 1492 PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 1493 PetscInt ncols,k,*ridx=PETSC_NULL; 1494 PetscTruth usecprow=a->compressedrow.use; 1495 1496 PetscFunctionBegin; 1497 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 1498 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1499 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1500 1501 idx = a->j; 1502 v = a->a; 1503 if (usecprow){ 1504 mbs = a->compressedrow.nrows; 1505 ii = a->compressedrow.i; 1506 ridx = a->compressedrow.rindex; 1507 } else { 1508 mbs = a->mbs; 1509 ii = a->i; 1510 z = zarray; 1511 } 1512 1513 if (!a->mult_work) { 1514 k = PetscMax(A->rmap->n,A->cmap->n); 1515 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1516 } 1517 work = a->mult_work; 1518 for (i=0; i<mbs; i++) { 1519 n = ii[1] - ii[0]; ii++; 1520 ncols = n*bs; 1521 workt = work; 1522 for (j=0; j<n; j++) { 1523 xb = x + bs*(*idx++); 1524 for (k=0; k<bs; k++) workt[k] = xb[k]; 1525 workt += bs; 1526 } 1527 if (usecprow) z = zarray + bs*ridx[i]; 1528 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1529 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 1530 v += n*bs2; 1531 if (!usecprow){ 1532 z += bs; 1533 } 1534 } 1535 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1536 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1537 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1538 PetscFunctionReturn(0); 1539 } 1540 1541 #undef __FUNCT__ 1542 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ" 1543 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1544 { 1545 PetscScalar zero = 0.0; 1546 PetscErrorCode ierr; 1547 1548 PetscFunctionBegin; 1549 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1550 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1551 PetscFunctionReturn(0); 1552 } 1553 1554 #undef __FUNCT__ 1555 #define __FUNCT__ "MatMultTranspose_SeqBAIJ" 1556 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1557 { 1558 PetscScalar zero = 0.0; 1559 PetscErrorCode ierr; 1560 1561 PetscFunctionBegin; 1562 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1563 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1564 PetscFunctionReturn(0); 1565 } 1566 1567 #undef __FUNCT__ 1568 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ" 1569 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1570 1571 { 1572 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1573 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1574 MatScalar *v; 1575 PetscErrorCode ierr; 1576 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL; 1577 Mat_CompressedRow cprow = a->compressedrow; 1578 PetscTruth usecprow=cprow.use; 1579 1580 PetscFunctionBegin; 1581 if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); } 1582 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1583 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1584 1585 idx = a->j; 1586 v = a->a; 1587 if (usecprow){ 1588 mbs = cprow.nrows; 1589 ii = cprow.i; 1590 ridx = cprow.rindex; 1591 } else { 1592 mbs=a->mbs; 1593 ii = a->i; 1594 xb = x; 1595 } 1596 1597 switch (bs) { 1598 case 1: 1599 for (i=0; i<mbs; i++) { 1600 if (usecprow) xb = x + ridx[i]; 1601 x1 = xb[0]; 1602 ib = idx + ii[0]; 1603 n = ii[1] - ii[0]; ii++; 1604 for (j=0; j<n; j++) { 1605 rval = ib[j]; 1606 z[rval] += PetscConj(*v) * x1; 1607 v++; 1608 } 1609 if (!usecprow) xb++; 1610 } 1611 break; 1612 case 2: 1613 for (i=0; i<mbs; i++) { 1614 if (usecprow) xb = x + 2*ridx[i]; 1615 x1 = xb[0]; x2 = xb[1]; 1616 ib = idx + ii[0]; 1617 n = ii[1] - ii[0]; ii++; 1618 for (j=0; j<n; j++) { 1619 rval = ib[j]*2; 1620 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 1621 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 1622 v += 4; 1623 } 1624 if (!usecprow) xb += 2; 1625 } 1626 break; 1627 case 3: 1628 for (i=0; i<mbs; i++) { 1629 if (usecprow) xb = x + 3*ridx[i]; 1630 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1631 ib = idx + ii[0]; 1632 n = ii[1] - ii[0]; ii++; 1633 for (j=0; j<n; j++) { 1634 rval = ib[j]*3; 1635 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 1636 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 1637 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 1638 v += 9; 1639 } 1640 if (!usecprow) xb += 3; 1641 } 1642 break; 1643 case 4: 1644 for (i=0; i<mbs; i++) { 1645 if (usecprow) xb = x + 4*ridx[i]; 1646 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1647 ib = idx + ii[0]; 1648 n = ii[1] - ii[0]; ii++; 1649 for (j=0; j<n; j++) { 1650 rval = ib[j]*4; 1651 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 1652 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 1653 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 1654 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 1655 v += 16; 1656 } 1657 if (!usecprow) xb += 4; 1658 } 1659 break; 1660 case 5: 1661 for (i=0; i<mbs; i++) { 1662 if (usecprow) xb = x + 5*ridx[i]; 1663 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1664 x4 = xb[3]; x5 = xb[4]; 1665 ib = idx + ii[0]; 1666 n = ii[1] - ii[0]; ii++; 1667 for (j=0; j<n; j++) { 1668 rval = ib[j]*5; 1669 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 1670 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 1671 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 1672 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 1673 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 1674 v += 25; 1675 } 1676 if (!usecprow) xb += 5; 1677 } 1678 break; 1679 default: { /* block sizes larger than 5 by 5 are handled by BLAS */ 1680 PetscInt ncols,k; 1681 PetscScalar *work,*workt,*xtmp; 1682 1683 SETERRQ(PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 1684 if (!a->mult_work) { 1685 k = PetscMax(A->rmap->n,A->cmap->n); 1686 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1687 } 1688 work = a->mult_work; 1689 xtmp = x; 1690 for (i=0; i<mbs; i++) { 1691 n = ii[1] - ii[0]; ii++; 1692 ncols = n*bs; 1693 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1694 if (usecprow) { 1695 xtmp = x + bs*ridx[i]; 1696 } 1697 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1698 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1699 v += n*bs2; 1700 if (!usecprow) xtmp += bs; 1701 workt = work; 1702 for (j=0; j<n; j++) { 1703 zb = z + bs*(*idx++); 1704 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1705 workt += bs; 1706 } 1707 } 1708 } 1709 } 1710 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1711 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1712 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1713 PetscFunctionReturn(0); 1714 } 1715 1716 #undef __FUNCT__ 1717 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ" 1718 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1719 { 1720 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1721 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1722 MatScalar *v; 1723 PetscErrorCode ierr; 1724 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL; 1725 Mat_CompressedRow cprow = a->compressedrow; 1726 PetscTruth usecprow=cprow.use; 1727 1728 PetscFunctionBegin; 1729 if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); } 1730 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1731 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1732 1733 idx = a->j; 1734 v = a->a; 1735 if (usecprow){ 1736 mbs = cprow.nrows; 1737 ii = cprow.i; 1738 ridx = cprow.rindex; 1739 } else { 1740 mbs=a->mbs; 1741 ii = a->i; 1742 xb = x; 1743 } 1744 1745 switch (bs) { 1746 case 1: 1747 for (i=0; i<mbs; i++) { 1748 if (usecprow) xb = x + ridx[i]; 1749 x1 = xb[0]; 1750 ib = idx + ii[0]; 1751 n = ii[1] - ii[0]; ii++; 1752 for (j=0; j<n; j++) { 1753 rval = ib[j]; 1754 z[rval] += *v * x1; 1755 v++; 1756 } 1757 if (!usecprow) xb++; 1758 } 1759 break; 1760 case 2: 1761 for (i=0; i<mbs; i++) { 1762 if (usecprow) xb = x + 2*ridx[i]; 1763 x1 = xb[0]; x2 = xb[1]; 1764 ib = idx + ii[0]; 1765 n = ii[1] - ii[0]; ii++; 1766 for (j=0; j<n; j++) { 1767 rval = ib[j]*2; 1768 z[rval++] += v[0]*x1 + v[1]*x2; 1769 z[rval++] += v[2]*x1 + v[3]*x2; 1770 v += 4; 1771 } 1772 if (!usecprow) xb += 2; 1773 } 1774 break; 1775 case 3: 1776 for (i=0; i<mbs; i++) { 1777 if (usecprow) xb = x + 3*ridx[i]; 1778 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1779 ib = idx + ii[0]; 1780 n = ii[1] - ii[0]; ii++; 1781 for (j=0; j<n; j++) { 1782 rval = ib[j]*3; 1783 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 1784 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 1785 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 1786 v += 9; 1787 } 1788 if (!usecprow) xb += 3; 1789 } 1790 break; 1791 case 4: 1792 for (i=0; i<mbs; i++) { 1793 if (usecprow) xb = x + 4*ridx[i]; 1794 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1795 ib = idx + ii[0]; 1796 n = ii[1] - ii[0]; ii++; 1797 for (j=0; j<n; j++) { 1798 rval = ib[j]*4; 1799 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1800 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1801 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1802 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1803 v += 16; 1804 } 1805 if (!usecprow) xb += 4; 1806 } 1807 break; 1808 case 5: 1809 for (i=0; i<mbs; i++) { 1810 if (usecprow) xb = x + 5*ridx[i]; 1811 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1812 x4 = xb[3]; x5 = xb[4]; 1813 ib = idx + ii[0]; 1814 n = ii[1] - ii[0]; ii++; 1815 for (j=0; j<n; j++) { 1816 rval = ib[j]*5; 1817 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1818 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1819 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1820 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1821 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1822 v += 25; 1823 } 1824 if (!usecprow) xb += 5; 1825 } 1826 break; 1827 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 1828 PetscInt ncols,k; 1829 PetscScalar *work,*workt,*xtmp; 1830 1831 if (!a->mult_work) { 1832 k = PetscMax(A->rmap->n,A->cmap->n); 1833 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1834 } 1835 work = a->mult_work; 1836 xtmp = x; 1837 for (i=0; i<mbs; i++) { 1838 n = ii[1] - ii[0]; ii++; 1839 ncols = n*bs; 1840 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1841 if (usecprow) { 1842 xtmp = x + bs*ridx[i]; 1843 } 1844 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1845 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1846 v += n*bs2; 1847 if (!usecprow) xtmp += bs; 1848 workt = work; 1849 for (j=0; j<n; j++) { 1850 zb = z + bs*(*idx++); 1851 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1852 workt += bs; 1853 } 1854 } 1855 } 1856 } 1857 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1858 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1859 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1860 PetscFunctionReturn(0); 1861 } 1862 1863 #undef __FUNCT__ 1864 #define __FUNCT__ "MatScale_SeqBAIJ" 1865 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 1866 { 1867 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 1868 PetscInt totalnz = a->bs2*a->nz; 1869 PetscScalar oalpha = alpha; 1870 PetscErrorCode ierr; 1871 PetscBLASInt one = 1,tnz = PetscBLASIntCast(totalnz); 1872 1873 PetscFunctionBegin; 1874 BLASscal_(&tnz,&oalpha,a->a,&one); 1875 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1876 PetscFunctionReturn(0); 1877 } 1878 1879 #undef __FUNCT__ 1880 #define __FUNCT__ "MatNorm_SeqBAIJ" 1881 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 1882 { 1883 PetscErrorCode ierr; 1884 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1885 MatScalar *v = a->a; 1886 PetscReal sum = 0.0; 1887 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 1888 1889 PetscFunctionBegin; 1890 if (type == NORM_FROBENIUS) { 1891 for (i=0; i< bs2*nz; i++) { 1892 #if defined(PETSC_USE_COMPLEX) 1893 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 1894 #else 1895 sum += (*v)*(*v); v++; 1896 #endif 1897 } 1898 *norm = sqrt(sum); 1899 } else if (type == NORM_1) { /* maximum column sum */ 1900 PetscReal *tmp; 1901 PetscInt *bcol = a->j; 1902 ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr); 1903 ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr); 1904 for (i=0; i<nz; i++){ 1905 for (j=0; j<bs; j++){ 1906 k1 = bs*(*bcol) + j; /* column index */ 1907 for (k=0; k<bs; k++){ 1908 tmp[k1] += PetscAbsScalar(*v); v++; 1909 } 1910 } 1911 bcol++; 1912 } 1913 *norm = 0.0; 1914 for (j=0; j<A->cmap->n; j++) { 1915 if (tmp[j] > *norm) *norm = tmp[j]; 1916 } 1917 ierr = PetscFree(tmp);CHKERRQ(ierr); 1918 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1919 *norm = 0.0; 1920 for (k=0; k<bs; k++) { 1921 for (j=0; j<a->mbs; j++) { 1922 v = a->a + bs2*a->i[j] + k; 1923 sum = 0.0; 1924 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 1925 for (k1=0; k1<bs; k1++){ 1926 sum += PetscAbsScalar(*v); 1927 v += bs; 1928 } 1929 } 1930 if (sum > *norm) *norm = sum; 1931 } 1932 } 1933 } else { 1934 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1935 } 1936 PetscFunctionReturn(0); 1937 } 1938 1939 1940 #undef __FUNCT__ 1941 #define __FUNCT__ "MatEqual_SeqBAIJ" 1942 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg) 1943 { 1944 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data; 1945 PetscErrorCode ierr; 1946 1947 PetscFunctionBegin; 1948 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1949 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1950 *flg = PETSC_FALSE; 1951 PetscFunctionReturn(0); 1952 } 1953 1954 /* if the a->i are the same */ 1955 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1956 if (!*flg) { 1957 PetscFunctionReturn(0); 1958 } 1959 1960 /* if a->j are the same */ 1961 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1962 if (!*flg) { 1963 PetscFunctionReturn(0); 1964 } 1965 /* if a->a are the same */ 1966 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1967 PetscFunctionReturn(0); 1968 1969 } 1970 1971 #undef __FUNCT__ 1972 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ" 1973 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 1974 { 1975 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1976 PetscErrorCode ierr; 1977 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1978 PetscScalar *x,zero = 0.0; 1979 MatScalar *aa,*aa_j; 1980 1981 PetscFunctionBegin; 1982 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1983 bs = A->rmap->bs; 1984 aa = a->a; 1985 ai = a->i; 1986 aj = a->j; 1987 ambs = a->mbs; 1988 bs2 = a->bs2; 1989 1990 ierr = VecSet(v,zero);CHKERRQ(ierr); 1991 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1992 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1993 if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1994 for (i=0; i<ambs; i++) { 1995 for (j=ai[i]; j<ai[i+1]; j++) { 1996 if (aj[j] == i) { 1997 row = i*bs; 1998 aa_j = aa+j*bs2; 1999 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2000 break; 2001 } 2002 } 2003 } 2004 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2005 PetscFunctionReturn(0); 2006 } 2007 2008 #undef __FUNCT__ 2009 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ" 2010 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2011 { 2012 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2013 PetscScalar *l,*r,x,*li,*ri; 2014 MatScalar *aa,*v; 2015 PetscErrorCode ierr; 2016 PetscInt i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2; 2017 2018 PetscFunctionBegin; 2019 ai = a->i; 2020 aj = a->j; 2021 aa = a->a; 2022 m = A->rmap->n; 2023 n = A->cmap->n; 2024 bs = A->rmap->bs; 2025 mbs = a->mbs; 2026 bs2 = a->bs2; 2027 if (ll) { 2028 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 2029 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2030 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2031 for (i=0; i<mbs; i++) { /* for each block row */ 2032 M = ai[i+1] - ai[i]; 2033 li = l + i*bs; 2034 v = aa + bs2*ai[i]; 2035 for (j=0; j<M; j++) { /* for each block */ 2036 for (k=0; k<bs2; k++) { 2037 (*v++) *= li[k%bs]; 2038 } 2039 } 2040 } 2041 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 2042 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2043 } 2044 2045 if (rr) { 2046 ierr = VecGetArray(rr,&r);CHKERRQ(ierr); 2047 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2048 if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2049 for (i=0; i<mbs; i++) { /* for each block row */ 2050 M = ai[i+1] - ai[i]; 2051 v = aa + bs2*ai[i]; 2052 for (j=0; j<M; j++) { /* for each block */ 2053 ri = r + bs*aj[ai[i]+j]; 2054 for (k=0; k<bs; k++) { 2055 x = ri[k]; 2056 for (tmp=0; tmp<bs; tmp++) (*v++) *= x; 2057 } 2058 } 2059 } 2060 ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr); 2061 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2062 } 2063 PetscFunctionReturn(0); 2064 } 2065 2066 2067 #undef __FUNCT__ 2068 #define __FUNCT__ "MatGetInfo_SeqBAIJ" 2069 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2070 { 2071 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2072 2073 PetscFunctionBegin; 2074 info->block_size = a->bs2; 2075 info->nz_allocated = a->maxnz; 2076 info->nz_used = a->bs2*a->nz; 2077 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2078 info->assemblies = A->num_ass; 2079 info->mallocs = a->reallocs; 2080 info->memory = ((PetscObject)A)->mem; 2081 if (A->factor) { 2082 info->fill_ratio_given = A->info.fill_ratio_given; 2083 info->fill_ratio_needed = A->info.fill_ratio_needed; 2084 info->factor_mallocs = A->info.factor_mallocs; 2085 } else { 2086 info->fill_ratio_given = 0; 2087 info->fill_ratio_needed = 0; 2088 info->factor_mallocs = 0; 2089 } 2090 PetscFunctionReturn(0); 2091 } 2092 2093 2094 #undef __FUNCT__ 2095 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2096 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2097 { 2098 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2099 PetscErrorCode ierr; 2100 2101 PetscFunctionBegin; 2102 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2103 PetscFunctionReturn(0); 2104 } 2105