1 #define PETSCMAT_DLL 2 3 #include "../src/mat/impls/baij/seq/baij.h" 4 #include "../src/mat/blockinvert.h" 5 #include "petscbt.h" 6 #include "petscblaslapack.h" 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ" 10 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 11 { 12 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 15 const PetscInt *idx; 16 PetscInt start,end,*ai,*aj,bs,*nidx2; 17 PetscBT table; 18 19 PetscFunctionBegin; 20 m = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 25 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 26 27 ierr = PetscBTCreate(m,table);CHKERRQ(ierr); 28 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 29 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 30 31 for (i=0; i<is_max; i++) { 32 /* Initialise the two local arrays */ 33 isz = 0; 34 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 35 36 /* Extract the indices, assume there can be duplicate entries */ 37 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 38 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 39 40 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 41 for (j=0; j<n ; ++j){ 42 ival = idx[j]/bs; /* convert the indices into block indices */ 43 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 44 if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;} 45 } 46 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 47 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 48 49 k = 0; 50 for (j=0; j<ov; j++){ /* for each overlap*/ 51 n = isz; 52 for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */ 53 row = nidx[k]; 54 start = ai[row]; 55 end = ai[row+1]; 56 for (l = start; l<end ; l++){ 57 val = aj[l]; 58 if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;} 59 } 60 } 61 } 62 /* expand the Index Set */ 63 for (j=0; j<isz; j++) { 64 for (k=0; k<bs; k++) 65 nidx2[j*bs+k] = nidx[j]*bs+k; 66 } 67 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 68 } 69 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 70 ierr = PetscFree(nidx);CHKERRQ(ierr); 71 ierr = PetscFree(nidx2);CHKERRQ(ierr); 72 PetscFunctionReturn(0); 73 } 74 75 #undef __FUNCT__ 76 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private" 77 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 78 { 79 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 80 PetscErrorCode ierr; 81 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 82 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 83 const PetscInt *irow,*icol; 84 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 85 PetscInt *aj = a->j,*ai = a->i; 86 MatScalar *mat_a; 87 Mat C; 88 PetscBool flag,sorted; 89 90 PetscFunctionBegin; 91 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 92 if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 93 94 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 95 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 96 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 97 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 98 99 ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 100 ssmap = smap; 101 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 102 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 103 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 104 /* determine lens of each row */ 105 for (i=0; i<nrows; i++) { 106 kstart = ai[irow[i]]; 107 kend = kstart + a->ilen[irow[i]]; 108 lens[i] = 0; 109 for (k=kstart; k<kend; k++) { 110 if (ssmap[aj[k]]) { 111 lens[i]++; 112 } 113 } 114 } 115 /* Create and fill new matrix */ 116 if (scall == MAT_REUSE_MATRIX) { 117 c = (Mat_SeqBAIJ *)((*B)->data); 118 119 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 120 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 121 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 122 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 123 C = *B; 124 } else { 125 ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 126 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 127 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 128 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr); 129 } 130 c = (Mat_SeqBAIJ *)(C->data); 131 for (i=0; i<nrows; i++) { 132 row = irow[i]; 133 kstart = ai[row]; 134 kend = kstart + a->ilen[row]; 135 mat_i = c->i[i]; 136 mat_j = c->j + mat_i; 137 mat_a = c->a + mat_i*bs2; 138 mat_ilen = c->ilen + i; 139 for (k=kstart; k<kend; k++) { 140 if ((tcol=ssmap[a->j[k]])) { 141 *mat_j++ = tcol - 1; 142 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 143 mat_a += bs2; 144 (*mat_ilen)++; 145 } 146 } 147 } 148 149 /* Free work space */ 150 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 151 ierr = PetscFree(smap);CHKERRQ(ierr); 152 ierr = PetscFree(lens);CHKERRQ(ierr); 153 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 154 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 155 156 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 157 *B = C; 158 PetscFunctionReturn(0); 159 } 160 161 #undef __FUNCT__ 162 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ" 163 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 164 { 165 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 166 IS is1,is2; 167 PetscErrorCode ierr; 168 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count; 169 const PetscInt *irow,*icol; 170 171 PetscFunctionBegin; 172 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 173 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 174 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 175 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 176 177 /* Verify if the indices corespond to each element in a block 178 and form the IS with compressed IS */ 179 ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr); 180 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 181 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 182 count = 0; 183 for (i=0; i<a->mbs; i++) { 184 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 185 if (vary[i]==bs) iary[count++] = i; 186 } 187 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 188 189 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 190 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 191 count = 0; 192 for (i=0; i<a->mbs; i++) { 193 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 194 if (vary[i]==bs) iary[count++] = i; 195 } 196 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 197 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 198 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 199 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 200 201 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 202 ISDestroy(is1); 203 ISDestroy(is2); 204 PetscFunctionReturn(0); 205 } 206 207 #undef __FUNCT__ 208 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ" 209 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 210 { 211 PetscErrorCode ierr; 212 PetscInt i; 213 214 PetscFunctionBegin; 215 if (scall == MAT_INITIAL_MATRIX) { 216 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 217 } 218 219 for (i=0; i<n; i++) { 220 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 221 } 222 PetscFunctionReturn(0); 223 } 224 225 226 /* -------------------------------------------------------*/ 227 /* Should check that shapes of vectors and matrices match */ 228 /* -------------------------------------------------------*/ 229 230 #undef __FUNCT__ 231 #define __FUNCT__ "MatMult_SeqBAIJ_1" 232 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 233 { 234 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 235 PetscScalar *z,sum; 236 const PetscScalar *x; 237 const MatScalar *v; 238 PetscErrorCode ierr; 239 PetscInt mbs,i,n,nonzerorow=0; 240 const PetscInt *idx,*ii,*ridx=PETSC_NULL; 241 PetscBool usecprow=a->compressedrow.use; 242 243 PetscFunctionBegin; 244 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 245 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 246 247 if (usecprow){ 248 mbs = a->compressedrow.nrows; 249 ii = a->compressedrow.i; 250 ridx = a->compressedrow.rindex; 251 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 252 } else { 253 mbs = a->mbs; 254 ii = a->i; 255 } 256 257 for (i=0; i<mbs; i++) { 258 n = ii[1] - ii[0]; 259 v = a->a + ii[0]; 260 idx = a->j + ii[0]; 261 ii++; 262 sum = 0.0; 263 PetscSparseDensePlusDot(sum,x,v,idx,n); 264 if (usecprow){ 265 z[ridx[i]] = sum; 266 } else { 267 nonzerorow += (n>0); 268 z[i] = sum; 269 } 270 } 271 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 272 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 273 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 274 PetscFunctionReturn(0); 275 } 276 277 #undef __FUNCT__ 278 #define __FUNCT__ "MatMult_SeqBAIJ_2" 279 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 280 { 281 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 282 PetscScalar *z = 0,sum1,sum2,*zarray; 283 const PetscScalar *x,*xb; 284 PetscScalar x1,x2; 285 const MatScalar *v; 286 PetscErrorCode ierr; 287 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 288 PetscBool usecprow=a->compressedrow.use; 289 290 PetscFunctionBegin; 291 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 292 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 293 294 idx = a->j; 295 v = a->a; 296 if (usecprow){ 297 mbs = a->compressedrow.nrows; 298 ii = a->compressedrow.i; 299 ridx = a->compressedrow.rindex; 300 } else { 301 mbs = a->mbs; 302 ii = a->i; 303 z = zarray; 304 } 305 306 for (i=0; i<mbs; i++) { 307 n = ii[1] - ii[0]; ii++; 308 sum1 = 0.0; sum2 = 0.0; 309 nonzerorow += (n>0); 310 for (j=0; j<n; j++) { 311 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 312 sum1 += v[0]*x1 + v[2]*x2; 313 sum2 += v[1]*x1 + v[3]*x2; 314 v += 4; 315 } 316 if (usecprow) z = zarray + 2*ridx[i]; 317 z[0] = sum1; z[1] = sum2; 318 if (!usecprow) z += 2; 319 } 320 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 321 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 322 ierr = PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);CHKERRQ(ierr); 323 PetscFunctionReturn(0); 324 } 325 326 #undef __FUNCT__ 327 #define __FUNCT__ "MatMult_SeqBAIJ_3" 328 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 329 { 330 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 331 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 332 const PetscScalar *x,*xb; 333 const MatScalar *v; 334 PetscErrorCode ierr; 335 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 336 PetscBool usecprow=a->compressedrow.use; 337 338 339 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 340 #pragma disjoint(*v,*z,*xb) 341 #endif 342 343 PetscFunctionBegin; 344 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 345 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 346 347 idx = a->j; 348 v = a->a; 349 if (usecprow){ 350 mbs = a->compressedrow.nrows; 351 ii = a->compressedrow.i; 352 ridx = a->compressedrow.rindex; 353 } else { 354 mbs = a->mbs; 355 ii = a->i; 356 z = zarray; 357 } 358 359 for (i=0; i<mbs; i++) { 360 n = ii[1] - ii[0]; ii++; 361 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 362 nonzerorow += (n>0); 363 for (j=0; j<n; j++) { 364 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 365 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 366 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 367 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 368 v += 9; 369 } 370 if (usecprow) z = zarray + 3*ridx[i]; 371 z[0] = sum1; z[1] = sum2; z[2] = sum3; 372 if (!usecprow) z += 3; 373 } 374 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 375 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 376 ierr = PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);CHKERRQ(ierr); 377 PetscFunctionReturn(0); 378 } 379 380 #undef __FUNCT__ 381 #define __FUNCT__ "MatMult_SeqBAIJ_4" 382 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 383 { 384 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 385 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 386 const PetscScalar *x,*xb; 387 const MatScalar *v; 388 PetscErrorCode ierr; 389 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 390 PetscBool usecprow=a->compressedrow.use; 391 392 PetscFunctionBegin; 393 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 394 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 395 396 idx = a->j; 397 v = a->a; 398 if (usecprow){ 399 mbs = a->compressedrow.nrows; 400 ii = a->compressedrow.i; 401 ridx = a->compressedrow.rindex; 402 } else { 403 mbs = a->mbs; 404 ii = a->i; 405 z = zarray; 406 } 407 408 for (i=0; i<mbs; i++) { 409 n = ii[1] - ii[0]; ii++; 410 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 411 nonzerorow += (n>0); 412 for (j=0; j<n; j++) { 413 xb = x + 4*(*idx++); 414 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 415 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 416 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 417 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 418 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 419 v += 16; 420 } 421 if (usecprow) z = zarray + 4*ridx[i]; 422 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 423 if (!usecprow) z += 4; 424 } 425 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 426 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 427 ierr = PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);CHKERRQ(ierr); 428 PetscFunctionReturn(0); 429 } 430 431 #undef __FUNCT__ 432 #define __FUNCT__ "MatMult_SeqBAIJ_5" 433 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 434 { 435 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 436 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 437 const PetscScalar *xb,*x; 438 const MatScalar *v; 439 PetscErrorCode ierr; 440 const PetscInt *idx,*ii,*ridx=PETSC_NULL; 441 PetscInt mbs,i,j,n,nonzerorow=0; 442 PetscBool usecprow=a->compressedrow.use; 443 444 PetscFunctionBegin; 445 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 446 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 447 448 idx = a->j; 449 v = a->a; 450 if (usecprow){ 451 mbs = a->compressedrow.nrows; 452 ii = a->compressedrow.i; 453 ridx = a->compressedrow.rindex; 454 } else { 455 mbs = a->mbs; 456 ii = a->i; 457 z = zarray; 458 } 459 460 for (i=0; i<mbs; i++) { 461 n = ii[1] - ii[0]; ii++; 462 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 463 nonzerorow += (n>0); 464 for (j=0; j<n; j++) { 465 xb = x + 5*(*idx++); 466 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 467 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 468 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 469 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 470 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 471 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 472 v += 25; 473 } 474 if (usecprow) z = zarray + 5*ridx[i]; 475 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 476 if (!usecprow) z += 5; 477 } 478 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 479 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 480 ierr = PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);CHKERRQ(ierr); 481 PetscFunctionReturn(0); 482 } 483 484 485 #undef __FUNCT__ 486 #define __FUNCT__ "MatMult_SeqBAIJ_6" 487 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 488 { 489 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 490 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 491 const PetscScalar *x,*xb; 492 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 493 const MatScalar *v; 494 PetscErrorCode ierr; 495 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 496 PetscBool usecprow=a->compressedrow.use; 497 498 PetscFunctionBegin; 499 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 500 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 501 502 idx = a->j; 503 v = a->a; 504 if (usecprow){ 505 mbs = a->compressedrow.nrows; 506 ii = a->compressedrow.i; 507 ridx = a->compressedrow.rindex; 508 } else { 509 mbs = a->mbs; 510 ii = a->i; 511 z = zarray; 512 } 513 514 for (i=0; i<mbs; i++) { 515 n = ii[1] - ii[0]; ii++; 516 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; 517 nonzerorow += (n>0); 518 for (j=0; j<n; j++) { 519 xb = x + 6*(*idx++); 520 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 521 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 522 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 523 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 524 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 525 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 526 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 527 v += 36; 528 } 529 if (usecprow) z = zarray + 6*ridx[i]; 530 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 531 if (!usecprow) z += 6; 532 } 533 534 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 535 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 536 ierr = PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);CHKERRQ(ierr); 537 PetscFunctionReturn(0); 538 } 539 540 #undef __FUNCT__ 541 #define __FUNCT__ "MatMult_SeqBAIJ_7" 542 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 543 { 544 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 545 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 546 const PetscScalar *x,*xb; 547 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 548 const MatScalar *v; 549 PetscErrorCode ierr; 550 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 551 PetscBool usecprow=a->compressedrow.use; 552 553 PetscFunctionBegin; 554 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 555 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 556 557 idx = a->j; 558 v = a->a; 559 if (usecprow){ 560 mbs = a->compressedrow.nrows; 561 ii = a->compressedrow.i; 562 ridx = a->compressedrow.rindex; 563 } else { 564 mbs = a->mbs; 565 ii = a->i; 566 z = zarray; 567 } 568 569 for (i=0; i<mbs; i++) { 570 n = ii[1] - ii[0]; ii++; 571 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 572 nonzerorow += (n>0); 573 for (j=0; j<n; j++) { 574 xb = x + 7*(*idx++); 575 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 576 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 577 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 578 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 579 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 580 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 581 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 582 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 583 v += 49; 584 } 585 if (usecprow) z = zarray + 7*ridx[i]; 586 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 587 if (!usecprow) z += 7; 588 } 589 590 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 591 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 592 ierr = PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);CHKERRQ(ierr); 593 PetscFunctionReturn(0); 594 } 595 596 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 597 /* Default MatMult for block size 15 */ 598 599 #undef __FUNCT__ 600 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1" 601 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 602 { 603 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 604 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 605 const PetscScalar *x,*xb; 606 PetscScalar *zarray,xv; 607 const MatScalar *v; 608 PetscErrorCode ierr; 609 const PetscInt *ii,*ij=a->j,*idx; 610 PetscInt mbs,i,j,k,n,*ridx=PETSC_NULL,nonzerorow=0; 611 PetscBool usecprow=a->compressedrow.use; 612 613 PetscFunctionBegin; 614 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 615 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 616 617 v = a->a; 618 if (usecprow){ 619 mbs = a->compressedrow.nrows; 620 ii = a->compressedrow.i; 621 ridx = a->compressedrow.rindex; 622 } else { 623 mbs = a->mbs; 624 ii = a->i; 625 z = zarray; 626 } 627 628 for (i=0; i<mbs; i++) { 629 n = ii[i+1] - ii[i]; 630 idx = ij + ii[i]; 631 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 632 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 633 634 nonzerorow += (n>0); 635 for (j=0; j<n; j++) { 636 xb = x + 15*(idx[j]); 637 638 for(k=0;k<15;k++){ 639 xv = xb[k]; 640 sum1 += v[0]*xv; 641 sum2 += v[1]*xv; 642 sum3 += v[2]*xv; 643 sum4 += v[3]*xv; 644 sum5 += v[4]*xv; 645 sum6 += v[5]*xv; 646 sum7 += v[6]*xv; 647 sum8 += v[7]*xv; 648 sum9 += v[8]*xv; 649 sum10 += v[9]*xv; 650 sum11 += v[10]*xv; 651 sum12 += v[11]*xv; 652 sum13 += v[12]*xv; 653 sum14 += v[13]*xv; 654 sum15 += v[14]*xv; 655 v += 15; 656 } 657 } 658 if (usecprow) z = zarray + 15*ridx[i]; 659 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 660 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 661 662 if (!usecprow) z += 15; 663 } 664 665 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 666 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 667 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 668 PetscFunctionReturn(0); 669 } 670 671 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 672 #undef __FUNCT__ 673 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2" 674 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 675 { 676 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 677 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 678 const PetscScalar *x,*xb; 679 PetscScalar x1,x2,x3,x4,*zarray; 680 const MatScalar *v; 681 PetscErrorCode ierr; 682 const PetscInt *ii,*ij=a->j,*idx; 683 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 684 PetscBool usecprow=a->compressedrow.use; 685 686 PetscFunctionBegin; 687 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 688 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 689 690 v = a->a; 691 if (usecprow){ 692 mbs = a->compressedrow.nrows; 693 ii = a->compressedrow.i; 694 ridx = a->compressedrow.rindex; 695 } else { 696 mbs = a->mbs; 697 ii = a->i; 698 z = zarray; 699 } 700 701 for (i=0; i<mbs; i++) { 702 n = ii[i+1] - ii[i]; 703 idx = ij + ii[i]; 704 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 705 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 706 707 nonzerorow += (n>0); 708 for (j=0; j<n; j++) { 709 xb = x + 15*(idx[j]); 710 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 711 712 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 713 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 714 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 715 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 716 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 717 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 718 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 719 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 720 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 721 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 722 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 723 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 724 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 725 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 726 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 727 728 v += 60; 729 730 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 731 732 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 733 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 734 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 735 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 736 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 737 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 738 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 739 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 740 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 741 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 742 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 743 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 744 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 745 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 746 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 747 v += 60; 748 749 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 750 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 751 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 752 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 753 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 754 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 755 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 756 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 757 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 758 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 759 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 760 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 761 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 762 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 763 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 764 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 765 v += 60; 766 767 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 768 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 769 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 770 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 771 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 772 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 773 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 774 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 775 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 776 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 777 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 778 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 779 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 780 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 781 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 782 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 783 v += 45; 784 } 785 if (usecprow) z = zarray + 15*ridx[i]; 786 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 787 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 788 789 if (!usecprow) z += 15; 790 } 791 792 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 793 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 794 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 795 PetscFunctionReturn(0); 796 } 797 798 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 799 #undef __FUNCT__ 800 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3" 801 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 802 { 803 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 804 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 805 const PetscScalar *x,*xb; 806 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 807 const MatScalar *v; 808 PetscErrorCode ierr; 809 const PetscInt *ii,*ij=a->j,*idx; 810 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 811 PetscBool usecprow=a->compressedrow.use; 812 813 PetscFunctionBegin; 814 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 815 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 816 817 v = a->a; 818 if (usecprow){ 819 mbs = a->compressedrow.nrows; 820 ii = a->compressedrow.i; 821 ridx = a->compressedrow.rindex; 822 } else { 823 mbs = a->mbs; 824 ii = a->i; 825 z = zarray; 826 } 827 828 for (i=0; i<mbs; i++) { 829 n = ii[i+1] - ii[i]; 830 idx = ij + ii[i]; 831 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 832 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 833 834 nonzerorow += (n>0); 835 for (j=0; j<n; j++) { 836 xb = x + 15*(idx[j]); 837 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 838 x8 = xb[7]; 839 840 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 841 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 842 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 843 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 844 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 845 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 846 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 847 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 848 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 849 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 850 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 851 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 852 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 853 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 854 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 855 v += 120; 856 857 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 858 859 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 860 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 861 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 862 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 863 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 864 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 865 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 866 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 867 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 868 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 869 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 870 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 871 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 872 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 873 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 874 v += 105; 875 } 876 if (usecprow) z = zarray + 15*ridx[i]; 877 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 878 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 879 880 if (!usecprow) z += 15; 881 } 882 883 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 884 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 885 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 886 PetscFunctionReturn(0); 887 } 888 889 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 890 891 #undef __FUNCT__ 892 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4" 893 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 894 { 895 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 896 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 897 const PetscScalar *x,*xb; 898 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 899 const MatScalar *v; 900 PetscErrorCode ierr; 901 const PetscInt *ii,*ij=a->j,*idx; 902 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 903 PetscBool usecprow=a->compressedrow.use; 904 905 PetscFunctionBegin; 906 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 907 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 908 909 v = a->a; 910 if (usecprow){ 911 mbs = a->compressedrow.nrows; 912 ii = a->compressedrow.i; 913 ridx = a->compressedrow.rindex; 914 } else { 915 mbs = a->mbs; 916 ii = a->i; 917 z = zarray; 918 } 919 920 for (i=0; i<mbs; i++) { 921 n = ii[i+1] - ii[i]; 922 idx = ij + ii[i]; 923 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 924 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 925 926 nonzerorow += (n>0); 927 for (j=0; j<n; j++) { 928 xb = x + 15*(idx[j]); 929 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 930 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 931 932 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 933 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 934 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 935 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 936 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 937 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 938 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 939 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 940 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 941 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 942 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 943 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 944 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 945 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 946 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 947 v += 225; 948 } 949 if (usecprow) z = zarray + 15*ridx[i]; 950 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 951 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 952 953 if (!usecprow) z += 15; 954 } 955 956 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 957 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 958 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 959 PetscFunctionReturn(0); 960 } 961 962 963 /* 964 This will not work with MatScalar == float because it calls the BLAS 965 */ 966 #undef __FUNCT__ 967 #define __FUNCT__ "MatMult_SeqBAIJ_N" 968 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 969 { 970 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 971 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 972 MatScalar *v; 973 PetscErrorCode ierr; 974 PetscInt mbs=a->mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 975 PetscInt ncols,k,*ridx=PETSC_NULL,nonzerorow=0; 976 PetscBool usecprow=a->compressedrow.use; 977 978 PetscFunctionBegin; 979 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 980 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 981 982 idx = a->j; 983 v = a->a; 984 if (usecprow){ 985 mbs = a->compressedrow.nrows; 986 ii = a->compressedrow.i; 987 ridx = a->compressedrow.rindex; 988 } else { 989 mbs = a->mbs; 990 ii = a->i; 991 z = zarray; 992 } 993 994 if (!a->mult_work) { 995 k = PetscMax(A->rmap->n,A->cmap->n); 996 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 997 } 998 work = a->mult_work; 999 for (i=0; i<mbs; i++) { 1000 n = ii[1] - ii[0]; ii++; 1001 ncols = n*bs; 1002 workt = work; 1003 nonzerorow += (n>0); 1004 for (j=0; j<n; j++) { 1005 xb = x + bs*(*idx++); 1006 for (k=0; k<bs; k++) workt[k] = xb[k]; 1007 workt += bs; 1008 } 1009 if (usecprow) z = zarray + bs*ridx[i]; 1010 Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1011 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1012 v += n*bs2; 1013 if (!usecprow) z += bs; 1014 } 1015 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1016 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1017 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);CHKERRQ(ierr); 1018 PetscFunctionReturn(0); 1019 } 1020 1021 #undef __FUNCT__ 1022 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1" 1023 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1024 { 1025 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1026 const PetscScalar *x; 1027 PetscScalar *y,*z,sum; 1028 const MatScalar *v; 1029 PetscErrorCode ierr; 1030 PetscInt mbs=a->mbs,i,n,*ridx=PETSC_NULL,nonzerorow=0; 1031 const PetscInt *idx,*ii; 1032 PetscBool usecprow=a->compressedrow.use; 1033 1034 PetscFunctionBegin; 1035 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1036 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1037 if (zz != yy) { 1038 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1039 } else { 1040 z = y; 1041 } 1042 1043 idx = a->j; 1044 v = a->a; 1045 if (usecprow){ 1046 if (zz != yy){ 1047 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1048 } 1049 mbs = a->compressedrow.nrows; 1050 ii = a->compressedrow.i; 1051 ridx = a->compressedrow.rindex; 1052 } else { 1053 ii = a->i; 1054 } 1055 1056 for (i=0; i<mbs; i++) { 1057 n = ii[1] - ii[0]; 1058 ii++; 1059 if (!usecprow){ 1060 nonzerorow += (n>0); 1061 sum = y[i]; 1062 } else { 1063 sum = y[ridx[i]]; 1064 } 1065 PetscSparseDensePlusDot(sum,x,v,idx,n); 1066 v += n; 1067 idx += n; 1068 if (usecprow){ 1069 z[ridx[i]] = sum; 1070 } else { 1071 z[i] = sum; 1072 } 1073 } 1074 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1075 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1076 if (zz != yy) { 1077 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1078 } 1079 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 1080 PetscFunctionReturn(0); 1081 } 1082 1083 #undef __FUNCT__ 1084 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2" 1085 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1086 { 1087 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1088 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2; 1089 PetscScalar x1,x2,*yarray,*zarray; 1090 MatScalar *v; 1091 PetscErrorCode ierr; 1092 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1093 PetscBool usecprow=a->compressedrow.use; 1094 1095 PetscFunctionBegin; 1096 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1097 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1098 if (zz != yy) { 1099 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1100 } else { 1101 zarray = yarray; 1102 } 1103 1104 idx = a->j; 1105 v = a->a; 1106 if (usecprow){ 1107 if (zz != yy){ 1108 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1109 } 1110 mbs = a->compressedrow.nrows; 1111 ii = a->compressedrow.i; 1112 ridx = a->compressedrow.rindex; 1113 if (zz != yy){ 1114 ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1115 } 1116 } else { 1117 ii = a->i; 1118 y = yarray; 1119 z = zarray; 1120 } 1121 1122 for (i=0; i<mbs; i++) { 1123 n = ii[1] - ii[0]; ii++; 1124 if (usecprow){ 1125 z = zarray + 2*ridx[i]; 1126 y = yarray + 2*ridx[i]; 1127 } 1128 sum1 = y[0]; sum2 = y[1]; 1129 for (j=0; j<n; j++) { 1130 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 1131 sum1 += v[0]*x1 + v[2]*x2; 1132 sum2 += v[1]*x1 + v[3]*x2; 1133 v += 4; 1134 } 1135 z[0] = sum1; z[1] = sum2; 1136 if (!usecprow){ 1137 z += 2; y += 2; 1138 } 1139 } 1140 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1141 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1142 if (zz != yy) { 1143 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1144 } 1145 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1146 PetscFunctionReturn(0); 1147 } 1148 1149 #undef __FUNCT__ 1150 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3" 1151 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1152 { 1153 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1154 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1155 MatScalar *v; 1156 PetscErrorCode ierr; 1157 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1158 PetscBool usecprow=a->compressedrow.use; 1159 1160 PetscFunctionBegin; 1161 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1162 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1163 if (zz != yy) { 1164 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1165 } else { 1166 zarray = yarray; 1167 } 1168 1169 idx = a->j; 1170 v = a->a; 1171 if (usecprow){ 1172 if (zz != yy){ 1173 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1174 } 1175 mbs = a->compressedrow.nrows; 1176 ii = a->compressedrow.i; 1177 ridx = a->compressedrow.rindex; 1178 } else { 1179 ii = a->i; 1180 y = yarray; 1181 z = zarray; 1182 } 1183 1184 for (i=0; i<mbs; i++) { 1185 n = ii[1] - ii[0]; ii++; 1186 if (usecprow){ 1187 z = zarray + 3*ridx[i]; 1188 y = yarray + 3*ridx[i]; 1189 } 1190 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1191 for (j=0; j<n; j++) { 1192 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1193 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1194 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1195 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1196 v += 9; 1197 } 1198 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1199 if (!usecprow){ 1200 z += 3; y += 3; 1201 } 1202 } 1203 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1204 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1205 if (zz != yy) { 1206 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1207 } 1208 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1209 PetscFunctionReturn(0); 1210 } 1211 1212 #undef __FUNCT__ 1213 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4" 1214 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1215 { 1216 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1217 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1218 MatScalar *v; 1219 PetscErrorCode ierr; 1220 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1221 PetscBool usecprow=a->compressedrow.use; 1222 1223 PetscFunctionBegin; 1224 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1225 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1226 if (zz != yy) { 1227 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1228 } else { 1229 zarray = yarray; 1230 } 1231 1232 idx = a->j; 1233 v = a->a; 1234 if (usecprow){ 1235 if (zz != yy){ 1236 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1237 } 1238 mbs = a->compressedrow.nrows; 1239 ii = a->compressedrow.i; 1240 ridx = a->compressedrow.rindex; 1241 } else { 1242 ii = a->i; 1243 y = yarray; 1244 z = zarray; 1245 } 1246 1247 for (i=0; i<mbs; i++) { 1248 n = ii[1] - ii[0]; ii++; 1249 if (usecprow){ 1250 z = zarray + 4*ridx[i]; 1251 y = yarray + 4*ridx[i]; 1252 } 1253 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1254 for (j=0; j<n; j++) { 1255 xb = x + 4*(*idx++); 1256 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1257 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1258 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1259 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1260 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1261 v += 16; 1262 } 1263 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1264 if (!usecprow){ 1265 z += 4; y += 4; 1266 } 1267 } 1268 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1269 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1270 if (zz != yy) { 1271 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1272 } 1273 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1274 PetscFunctionReturn(0); 1275 } 1276 1277 #undef __FUNCT__ 1278 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5" 1279 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1280 { 1281 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1282 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1283 PetscScalar *yarray,*zarray; 1284 MatScalar *v; 1285 PetscErrorCode ierr; 1286 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1287 PetscBool usecprow=a->compressedrow.use; 1288 1289 PetscFunctionBegin; 1290 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1291 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1292 if (zz != yy) { 1293 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1294 } else { 1295 zarray = yarray; 1296 } 1297 1298 idx = a->j; 1299 v = a->a; 1300 if (usecprow){ 1301 if (zz != yy){ 1302 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1303 } 1304 mbs = a->compressedrow.nrows; 1305 ii = a->compressedrow.i; 1306 ridx = a->compressedrow.rindex; 1307 } else { 1308 ii = a->i; 1309 y = yarray; 1310 z = zarray; 1311 } 1312 1313 for (i=0; i<mbs; i++) { 1314 n = ii[1] - ii[0]; ii++; 1315 if (usecprow){ 1316 z = zarray + 5*ridx[i]; 1317 y = yarray + 5*ridx[i]; 1318 } 1319 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1320 for (j=0; j<n; j++) { 1321 xb = x + 5*(*idx++); 1322 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1323 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1324 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1325 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1326 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1327 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1328 v += 25; 1329 } 1330 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1331 if (!usecprow){ 1332 z += 5; y += 5; 1333 } 1334 } 1335 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1336 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1337 if (zz != yy) { 1338 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1339 } 1340 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1341 PetscFunctionReturn(0); 1342 } 1343 #undef __FUNCT__ 1344 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6" 1345 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1346 { 1347 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1348 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6; 1349 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1350 MatScalar *v; 1351 PetscErrorCode ierr; 1352 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1353 PetscBool usecprow=a->compressedrow.use; 1354 1355 PetscFunctionBegin; 1356 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1357 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1358 if (zz != yy) { 1359 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1360 } else { 1361 zarray = yarray; 1362 } 1363 1364 idx = a->j; 1365 v = a->a; 1366 if (usecprow){ 1367 if (zz != yy){ 1368 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1369 } 1370 mbs = a->compressedrow.nrows; 1371 ii = a->compressedrow.i; 1372 ridx = a->compressedrow.rindex; 1373 } else { 1374 ii = a->i; 1375 y = yarray; 1376 z = zarray; 1377 } 1378 1379 for (i=0; i<mbs; i++) { 1380 n = ii[1] - ii[0]; ii++; 1381 if (usecprow){ 1382 z = zarray + 6*ridx[i]; 1383 y = yarray + 6*ridx[i]; 1384 } 1385 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1386 for (j=0; j<n; j++) { 1387 xb = x + 6*(*idx++); 1388 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1389 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1390 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1391 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1392 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1393 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1394 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1395 v += 36; 1396 } 1397 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1398 if (!usecprow){ 1399 z += 6; y += 6; 1400 } 1401 } 1402 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1403 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1404 if (zz != yy) { 1405 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1406 } 1407 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1408 PetscFunctionReturn(0); 1409 } 1410 1411 #undef __FUNCT__ 1412 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7" 1413 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1414 { 1415 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1416 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1417 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1418 MatScalar *v; 1419 PetscErrorCode ierr; 1420 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1421 PetscBool usecprow=a->compressedrow.use; 1422 1423 PetscFunctionBegin; 1424 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1425 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1426 if (zz != yy) { 1427 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1428 } else { 1429 zarray = yarray; 1430 } 1431 1432 idx = a->j; 1433 v = a->a; 1434 if (usecprow){ 1435 if (zz != yy){ 1436 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1437 } 1438 mbs = a->compressedrow.nrows; 1439 ii = a->compressedrow.i; 1440 ridx = a->compressedrow.rindex; 1441 } else { 1442 ii = a->i; 1443 y = yarray; 1444 z = zarray; 1445 } 1446 1447 for (i=0; i<mbs; i++) { 1448 n = ii[1] - ii[0]; ii++; 1449 if (usecprow){ 1450 z = zarray + 7*ridx[i]; 1451 y = yarray + 7*ridx[i]; 1452 } 1453 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1454 for (j=0; j<n; j++) { 1455 xb = x + 7*(*idx++); 1456 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1457 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1458 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1459 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1460 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1461 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1462 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1463 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1464 v += 49; 1465 } 1466 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1467 if (!usecprow){ 1468 z += 7; y += 7; 1469 } 1470 } 1471 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1472 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1473 if (zz != yy) { 1474 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1475 } 1476 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1477 PetscFunctionReturn(0); 1478 } 1479 1480 #undef __FUNCT__ 1481 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N" 1482 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1483 { 1484 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1485 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 1486 MatScalar *v; 1487 PetscErrorCode ierr; 1488 PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 1489 PetscInt ncols,k,*ridx=PETSC_NULL; 1490 PetscBool usecprow=a->compressedrow.use; 1491 1492 PetscFunctionBegin; 1493 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1494 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1495 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1496 1497 idx = a->j; 1498 v = a->a; 1499 if (usecprow){ 1500 mbs = a->compressedrow.nrows; 1501 ii = a->compressedrow.i; 1502 ridx = a->compressedrow.rindex; 1503 } else { 1504 mbs = a->mbs; 1505 ii = a->i; 1506 z = zarray; 1507 } 1508 1509 if (!a->mult_work) { 1510 k = PetscMax(A->rmap->n,A->cmap->n); 1511 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1512 } 1513 work = a->mult_work; 1514 for (i=0; i<mbs; i++) { 1515 n = ii[1] - ii[0]; ii++; 1516 ncols = n*bs; 1517 workt = work; 1518 for (j=0; j<n; j++) { 1519 xb = x + bs*(*idx++); 1520 for (k=0; k<bs; k++) workt[k] = xb[k]; 1521 workt += bs; 1522 } 1523 if (usecprow) z = zarray + bs*ridx[i]; 1524 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1525 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 1526 v += n*bs2; 1527 if (!usecprow){ 1528 z += bs; 1529 } 1530 } 1531 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1532 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1533 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1534 PetscFunctionReturn(0); 1535 } 1536 1537 #undef __FUNCT__ 1538 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ" 1539 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1540 { 1541 PetscScalar zero = 0.0; 1542 PetscErrorCode ierr; 1543 1544 PetscFunctionBegin; 1545 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1546 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1547 PetscFunctionReturn(0); 1548 } 1549 1550 #undef __FUNCT__ 1551 #define __FUNCT__ "MatMultTranspose_SeqBAIJ" 1552 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1553 { 1554 PetscScalar zero = 0.0; 1555 PetscErrorCode ierr; 1556 1557 PetscFunctionBegin; 1558 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1559 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1560 PetscFunctionReturn(0); 1561 } 1562 1563 #undef __FUNCT__ 1564 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ" 1565 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1566 1567 { 1568 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1569 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1570 MatScalar *v; 1571 PetscErrorCode ierr; 1572 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL; 1573 Mat_CompressedRow cprow = a->compressedrow; 1574 PetscBool usecprow=cprow.use; 1575 1576 PetscFunctionBegin; 1577 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1578 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1579 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1580 1581 idx = a->j; 1582 v = a->a; 1583 if (usecprow){ 1584 mbs = cprow.nrows; 1585 ii = cprow.i; 1586 ridx = cprow.rindex; 1587 } else { 1588 mbs=a->mbs; 1589 ii = a->i; 1590 xb = x; 1591 } 1592 1593 switch (bs) { 1594 case 1: 1595 for (i=0; i<mbs; i++) { 1596 if (usecprow) xb = x + ridx[i]; 1597 x1 = xb[0]; 1598 ib = idx + ii[0]; 1599 n = ii[1] - ii[0]; ii++; 1600 for (j=0; j<n; j++) { 1601 rval = ib[j]; 1602 z[rval] += PetscConj(*v) * x1; 1603 v++; 1604 } 1605 if (!usecprow) xb++; 1606 } 1607 break; 1608 case 2: 1609 for (i=0; i<mbs; i++) { 1610 if (usecprow) xb = x + 2*ridx[i]; 1611 x1 = xb[0]; x2 = xb[1]; 1612 ib = idx + ii[0]; 1613 n = ii[1] - ii[0]; ii++; 1614 for (j=0; j<n; j++) { 1615 rval = ib[j]*2; 1616 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 1617 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 1618 v += 4; 1619 } 1620 if (!usecprow) xb += 2; 1621 } 1622 break; 1623 case 3: 1624 for (i=0; i<mbs; i++) { 1625 if (usecprow) xb = x + 3*ridx[i]; 1626 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1627 ib = idx + ii[0]; 1628 n = ii[1] - ii[0]; ii++; 1629 for (j=0; j<n; j++) { 1630 rval = ib[j]*3; 1631 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 1632 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 1633 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 1634 v += 9; 1635 } 1636 if (!usecprow) xb += 3; 1637 } 1638 break; 1639 case 4: 1640 for (i=0; i<mbs; i++) { 1641 if (usecprow) xb = x + 4*ridx[i]; 1642 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1643 ib = idx + ii[0]; 1644 n = ii[1] - ii[0]; ii++; 1645 for (j=0; j<n; j++) { 1646 rval = ib[j]*4; 1647 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 1648 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 1649 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 1650 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 1651 v += 16; 1652 } 1653 if (!usecprow) xb += 4; 1654 } 1655 break; 1656 case 5: 1657 for (i=0; i<mbs; i++) { 1658 if (usecprow) xb = x + 5*ridx[i]; 1659 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1660 x4 = xb[3]; x5 = xb[4]; 1661 ib = idx + ii[0]; 1662 n = ii[1] - ii[0]; ii++; 1663 for (j=0; j<n; j++) { 1664 rval = ib[j]*5; 1665 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 1666 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 1667 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 1668 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 1669 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 1670 v += 25; 1671 } 1672 if (!usecprow) xb += 5; 1673 } 1674 break; 1675 default: { /* block sizes larger than 5 by 5 are handled by BLAS */ 1676 PetscInt ncols,k; 1677 PetscScalar *work,*workt,*xtmp; 1678 1679 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 1680 if (!a->mult_work) { 1681 k = PetscMax(A->rmap->n,A->cmap->n); 1682 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1683 } 1684 work = a->mult_work; 1685 xtmp = x; 1686 for (i=0; i<mbs; i++) { 1687 n = ii[1] - ii[0]; ii++; 1688 ncols = n*bs; 1689 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1690 if (usecprow) { 1691 xtmp = x + bs*ridx[i]; 1692 } 1693 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1694 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1695 v += n*bs2; 1696 if (!usecprow) xtmp += bs; 1697 workt = work; 1698 for (j=0; j<n; j++) { 1699 zb = z + bs*(*idx++); 1700 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1701 workt += bs; 1702 } 1703 } 1704 } 1705 } 1706 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1707 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1708 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1709 PetscFunctionReturn(0); 1710 } 1711 1712 #undef __FUNCT__ 1713 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ" 1714 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1715 { 1716 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1717 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1718 MatScalar *v; 1719 PetscErrorCode ierr; 1720 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL; 1721 Mat_CompressedRow cprow = a->compressedrow; 1722 PetscBool usecprow=cprow.use; 1723 1724 PetscFunctionBegin; 1725 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1726 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1727 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1728 1729 idx = a->j; 1730 v = a->a; 1731 if (usecprow){ 1732 mbs = cprow.nrows; 1733 ii = cprow.i; 1734 ridx = cprow.rindex; 1735 } else { 1736 mbs=a->mbs; 1737 ii = a->i; 1738 xb = x; 1739 } 1740 1741 switch (bs) { 1742 case 1: 1743 for (i=0; i<mbs; i++) { 1744 if (usecprow) xb = x + ridx[i]; 1745 x1 = xb[0]; 1746 ib = idx + ii[0]; 1747 n = ii[1] - ii[0]; ii++; 1748 for (j=0; j<n; j++) { 1749 rval = ib[j]; 1750 z[rval] += *v * x1; 1751 v++; 1752 } 1753 if (!usecprow) xb++; 1754 } 1755 break; 1756 case 2: 1757 for (i=0; i<mbs; i++) { 1758 if (usecprow) xb = x + 2*ridx[i]; 1759 x1 = xb[0]; x2 = xb[1]; 1760 ib = idx + ii[0]; 1761 n = ii[1] - ii[0]; ii++; 1762 for (j=0; j<n; j++) { 1763 rval = ib[j]*2; 1764 z[rval++] += v[0]*x1 + v[1]*x2; 1765 z[rval++] += v[2]*x1 + v[3]*x2; 1766 v += 4; 1767 } 1768 if (!usecprow) xb += 2; 1769 } 1770 break; 1771 case 3: 1772 for (i=0; i<mbs; i++) { 1773 if (usecprow) xb = x + 3*ridx[i]; 1774 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1775 ib = idx + ii[0]; 1776 n = ii[1] - ii[0]; ii++; 1777 for (j=0; j<n; j++) { 1778 rval = ib[j]*3; 1779 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 1780 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 1781 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 1782 v += 9; 1783 } 1784 if (!usecprow) xb += 3; 1785 } 1786 break; 1787 case 4: 1788 for (i=0; i<mbs; i++) { 1789 if (usecprow) xb = x + 4*ridx[i]; 1790 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1791 ib = idx + ii[0]; 1792 n = ii[1] - ii[0]; ii++; 1793 for (j=0; j<n; j++) { 1794 rval = ib[j]*4; 1795 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1796 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1797 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1798 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1799 v += 16; 1800 } 1801 if (!usecprow) xb += 4; 1802 } 1803 break; 1804 case 5: 1805 for (i=0; i<mbs; i++) { 1806 if (usecprow) xb = x + 5*ridx[i]; 1807 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1808 x4 = xb[3]; x5 = xb[4]; 1809 ib = idx + ii[0]; 1810 n = ii[1] - ii[0]; ii++; 1811 for (j=0; j<n; j++) { 1812 rval = ib[j]*5; 1813 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1814 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1815 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1816 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1817 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1818 v += 25; 1819 } 1820 if (!usecprow) xb += 5; 1821 } 1822 break; 1823 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 1824 PetscInt ncols,k; 1825 PetscScalar *work,*workt,*xtmp; 1826 1827 if (!a->mult_work) { 1828 k = PetscMax(A->rmap->n,A->cmap->n); 1829 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1830 } 1831 work = a->mult_work; 1832 xtmp = x; 1833 for (i=0; i<mbs; i++) { 1834 n = ii[1] - ii[0]; ii++; 1835 ncols = n*bs; 1836 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1837 if (usecprow) { 1838 xtmp = x + bs*ridx[i]; 1839 } 1840 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1841 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1842 v += n*bs2; 1843 if (!usecprow) xtmp += bs; 1844 workt = work; 1845 for (j=0; j<n; j++) { 1846 zb = z + bs*(*idx++); 1847 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1848 workt += bs; 1849 } 1850 } 1851 } 1852 } 1853 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1854 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1855 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1856 PetscFunctionReturn(0); 1857 } 1858 1859 #undef __FUNCT__ 1860 #define __FUNCT__ "MatScale_SeqBAIJ" 1861 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 1862 { 1863 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 1864 PetscInt totalnz = a->bs2*a->nz; 1865 PetscScalar oalpha = alpha; 1866 PetscErrorCode ierr; 1867 PetscBLASInt one = 1,tnz = PetscBLASIntCast(totalnz); 1868 1869 PetscFunctionBegin; 1870 BLASscal_(&tnz,&oalpha,a->a,&one); 1871 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1872 PetscFunctionReturn(0); 1873 } 1874 1875 #undef __FUNCT__ 1876 #define __FUNCT__ "MatNorm_SeqBAIJ" 1877 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 1878 { 1879 PetscErrorCode ierr; 1880 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1881 MatScalar *v = a->a; 1882 PetscReal sum = 0.0; 1883 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 1884 1885 PetscFunctionBegin; 1886 if (type == NORM_FROBENIUS) { 1887 for (i=0; i< bs2*nz; i++) { 1888 #if defined(PETSC_USE_COMPLEX) 1889 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 1890 #else 1891 sum += (*v)*(*v); v++; 1892 #endif 1893 } 1894 *norm = sqrt(sum); 1895 } else if (type == NORM_1) { /* maximum column sum */ 1896 PetscReal *tmp; 1897 PetscInt *bcol = a->j; 1898 ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr); 1899 ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr); 1900 for (i=0; i<nz; i++){ 1901 for (j=0; j<bs; j++){ 1902 k1 = bs*(*bcol) + j; /* column index */ 1903 for (k=0; k<bs; k++){ 1904 tmp[k1] += PetscAbsScalar(*v); v++; 1905 } 1906 } 1907 bcol++; 1908 } 1909 *norm = 0.0; 1910 for (j=0; j<A->cmap->n; j++) { 1911 if (tmp[j] > *norm) *norm = tmp[j]; 1912 } 1913 ierr = PetscFree(tmp);CHKERRQ(ierr); 1914 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1915 *norm = 0.0; 1916 for (k=0; k<bs; k++) { 1917 for (j=0; j<a->mbs; j++) { 1918 v = a->a + bs2*a->i[j] + k; 1919 sum = 0.0; 1920 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 1921 for (k1=0; k1<bs; k1++){ 1922 sum += PetscAbsScalar(*v); 1923 v += bs; 1924 } 1925 } 1926 if (sum > *norm) *norm = sum; 1927 } 1928 } 1929 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1930 PetscFunctionReturn(0); 1931 } 1932 1933 1934 #undef __FUNCT__ 1935 #define __FUNCT__ "MatEqual_SeqBAIJ" 1936 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 1937 { 1938 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data; 1939 PetscErrorCode ierr; 1940 1941 PetscFunctionBegin; 1942 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1943 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1944 *flg = PETSC_FALSE; 1945 PetscFunctionReturn(0); 1946 } 1947 1948 /* if the a->i are the same */ 1949 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1950 if (!*flg) { 1951 PetscFunctionReturn(0); 1952 } 1953 1954 /* if a->j are the same */ 1955 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1956 if (!*flg) { 1957 PetscFunctionReturn(0); 1958 } 1959 /* if a->a are the same */ 1960 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1961 PetscFunctionReturn(0); 1962 1963 } 1964 1965 #undef __FUNCT__ 1966 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ" 1967 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 1968 { 1969 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1970 PetscErrorCode ierr; 1971 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1972 PetscScalar *x,zero = 0.0; 1973 MatScalar *aa,*aa_j; 1974 1975 PetscFunctionBegin; 1976 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1977 bs = A->rmap->bs; 1978 aa = a->a; 1979 ai = a->i; 1980 aj = a->j; 1981 ambs = a->mbs; 1982 bs2 = a->bs2; 1983 1984 ierr = VecSet(v,zero);CHKERRQ(ierr); 1985 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1986 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1987 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1988 for (i=0; i<ambs; i++) { 1989 for (j=ai[i]; j<ai[i+1]; j++) { 1990 if (aj[j] == i) { 1991 row = i*bs; 1992 aa_j = aa+j*bs2; 1993 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1994 break; 1995 } 1996 } 1997 } 1998 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1999 PetscFunctionReturn(0); 2000 } 2001 2002 #undef __FUNCT__ 2003 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ" 2004 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2005 { 2006 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2007 const PetscScalar *l,*r,*li,*ri; 2008 PetscScalar x; 2009 MatScalar *aa, *v; 2010 PetscErrorCode ierr; 2011 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2012 const PetscInt *ai,*aj; 2013 2014 PetscFunctionBegin; 2015 ai = a->i; 2016 aj = a->j; 2017 aa = a->a; 2018 m = A->rmap->n; 2019 n = A->cmap->n; 2020 bs = A->rmap->bs; 2021 mbs = a->mbs; 2022 bs2 = a->bs2; 2023 if (ll) { 2024 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2025 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2026 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2027 for (i=0; i<mbs; i++) { /* for each block row */ 2028 M = ai[i+1] - ai[i]; 2029 li = l + i*bs; 2030 v = aa + bs2*ai[i]; 2031 for (j=0; j<M; j++) { /* for each block */ 2032 for (k=0; k<bs2; k++) { 2033 (*v++) *= li[k%bs]; 2034 } 2035 } 2036 } 2037 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2038 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2039 } 2040 2041 if (rr) { 2042 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2043 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2044 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2045 for (i=0; i<mbs; i++) { /* for each block row */ 2046 iai = ai[i]; 2047 M = ai[i+1] - iai; 2048 v = aa + bs2*iai; 2049 for (j=0; j<M; j++) { /* for each block */ 2050 ri = r + bs*aj[iai+j]; 2051 for (k=0; k<bs; k++) { 2052 x = ri[k]; 2053 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2054 v += bs; 2055 } 2056 } 2057 } 2058 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2059 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2060 } 2061 PetscFunctionReturn(0); 2062 } 2063 2064 2065 #undef __FUNCT__ 2066 #define __FUNCT__ "MatGetInfo_SeqBAIJ" 2067 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2068 { 2069 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2070 2071 PetscFunctionBegin; 2072 info->block_size = a->bs2; 2073 info->nz_allocated = a->bs2*a->maxnz; 2074 info->nz_used = a->bs2*a->nz; 2075 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2076 info->assemblies = A->num_ass; 2077 info->mallocs = A->info.mallocs; 2078 info->memory = ((PetscObject)A)->mem; 2079 if (A->factortype) { 2080 info->fill_ratio_given = A->info.fill_ratio_given; 2081 info->fill_ratio_needed = A->info.fill_ratio_needed; 2082 info->factor_mallocs = A->info.factor_mallocs; 2083 } else { 2084 info->fill_ratio_given = 0; 2085 info->fill_ratio_needed = 0; 2086 info->factor_mallocs = 0; 2087 } 2088 PetscFunctionReturn(0); 2089 } 2090 2091 2092 #undef __FUNCT__ 2093 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2094 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2095 { 2096 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2097 PetscErrorCode ierr; 2098 2099 PetscFunctionBegin; 2100 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2101 PetscFunctionReturn(0); 2102 } 2103