#include <../src/mat/impls/baij/seq/baij.h> #include <../src/mat/impls/dense/seq/dense.h> #include #include #include #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) #include #endif PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; const PetscInt *idx; PetscInt start,end,*ai,*aj,bs,*nidx2; PetscBT table; PetscFunctionBegin; m = a->mbs; ai = a->i; aj = a->j; bs = A->rmap->bs; PetscCheck(ov >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); PetscCall(PetscBTCreate(m,&table)); PetscCall(PetscMalloc1(m+1,&nidx)); PetscCall(PetscMalloc1(A->rmap->N+1,&nidx2)); for (i=0; idata,*c; PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; const PetscInt *irow,*icol; PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; PetscInt *aj = a->j,*ai = a->i; MatScalar *mat_a; Mat C; PetscBool flag; PetscFunctionBegin; PetscCall(ISGetIndices(isrow,&irow)); PetscCall(ISGetIndices(iscol,&icol)); PetscCall(ISGetLocalSize(isrow,&nrows)); PetscCall(ISGetLocalSize(iscol,&ncols)); PetscCall(PetscCalloc1(1+oldcols,&smap)); ssmap = smap; PetscCall(PetscMalloc1(1+nrows,&lens)); for (i=0; iilen[irow[i]]; lens[i] = 0; for (k=kstart; kdata); PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); PetscCall(PetscArraycmp(c->ilen,lens,c->mbs,&flag)); PetscCheck(flag,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); PetscCall(PetscArrayzero(c->ilen,c->mbs)); C = *B; } else { PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&C)); PetscCall(MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE)); PetscCall(MatSetType(C,((PetscObject)A)->type_name)); PetscCall(MatSeqBAIJSetPreallocation(C,bs,0,lens)); } c = (Mat_SeqBAIJ*)(C->data); for (i=0; iilen[row]; mat_i = c->i[i]; mat_j = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */ mat_a = c->a ? c->a + mat_i*bs2 : NULL; /* mustn't add to NULL, that is UB */ mat_ilen = c->ilen + i; for (k=kstart; kj[k]])) { *mat_j++ = tcol - 1; PetscCall(PetscArraycpy(mat_a,a->a+k*bs2,bs2)); mat_a += bs2; (*mat_ilen)++; } } } /* sort */ if (c->j && c->a) { MatScalar *work; PetscCall(PetscMalloc1(bs2,&work)); for (i=0; ii[i]; mat_j = c->j + mat_i; mat_a = c->a + mat_i*bs2; ilen = c->ilen[i]; PetscCall(PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work)); } PetscCall(PetscFree(work)); } /* Free work space */ PetscCall(ISRestoreIndices(iscol,&icol)); PetscCall(PetscFree(smap)); PetscCall(PetscFree(lens)); PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY)); PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY)); PetscCall(ISRestoreIndices(isrow,&irow)); *B = C; PetscFunctionReturn(0); } PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; IS is1,is2; PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j; const PetscInt *irow,*icol; PetscFunctionBegin; PetscCall(ISGetIndices(isrow,&irow)); PetscCall(ISGetIndices(iscol,&icol)); PetscCall(ISGetLocalSize(isrow,&nrows)); PetscCall(ISGetLocalSize(iscol,&ncols)); /* Verify if the indices corespond to each element in a block and form the IS with compressed IS */ maxmnbs = PetscMax(a->mbs,a->nbs); PetscCall(PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary)); PetscCall(PetscArrayzero(vary,a->mbs)); for (i=0; imbs; i++) { PetscCheck(vary[i] == 0 || vary[i] == bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); } count = 0; for (i=0; inbs)); for (i=0; inbs; i++) { PetscCheck(vary[i] == 0 || vary[i] == bs,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); } count = 0; for (i=0; idata; Mat_SubSppt *submatj = c->submatis1; PetscFunctionBegin; PetscCall((*submatj->destroy)(C)); PetscCall(MatDestroySubMatrix_Private(submatj)); PetscFunctionReturn(0); } PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[]) { PetscInt i; Mat C; Mat_SeqBAIJ *c; Mat_SubSppt *submatj; PetscFunctionBegin; for (i=0; idata; submatj = c->submatis1; if (submatj) { if (--((PetscObject)C)->refct <= 0) { PetscCall((*submatj->destroy)(C)); PetscCall(MatDestroySubMatrix_Private(submatj)); PetscCall(PetscFree(C->defaultvectype)); PetscCall(PetscLayoutDestroy(&C->rmap)); PetscCall(PetscLayoutDestroy(&C->cmap)); PetscCall(PetscHeaderDestroy(&C)); } } else { PetscCall(MatDestroy(&C)); } } /* Destroy Dummy submatrices created for reuse */ PetscCall(MatDestroySubMatrices_Dummy(n,mat)); PetscCall(PetscFree(*mat)); PetscFunctionReturn(0); } PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) { PetscInt i; PetscFunctionBegin; if (scall == MAT_INITIAL_MATRIX) { PetscCall(PetscCalloc1(n+1,B)); } for (i=0; idata; PetscScalar *z,sum; const PetscScalar *x; const MatScalar *v; PetscInt mbs,i,n; const PetscInt *idx,*ii,*ridx=NULL; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayWrite(zz,&z)); if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(z,a->mbs)); } else { mbs = a->mbs; ii = a->i; } for (i=0; ia + ii[0]; idx = a->j + ii[0]; ii++; PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ sum = 0.0; PetscSparseDensePlusDot(sum,x,v,idx,n); if (usecprow) { z[ridx[i]] = sum; } else { z[i] = sum; } } PetscCall(VecRestoreArrayRead(xx,&x)); PetscCall(VecRestoreArrayWrite(zz,&z)); PetscCall(PetscLogFlops(2.0*a->nz - a->nonzerorowcnt)); PetscFunctionReturn(0); } PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *z = NULL,sum1,sum2,*zarray; const PetscScalar *x,*xb; PetscScalar x1,x2; const MatScalar *v; PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayWrite(zz,&zarray)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray,2*a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i=0; inz - 2.0*a->nonzerorowcnt)); PetscFunctionReturn(0); } PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray; const PetscScalar *x,*xb; const MatScalar *v; PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; PetscBool usecprow=a->compressedrow.use; #if defined(PETSC_HAVE_PRAGMA_DISJOINT) #pragma disjoint(*v,*z,*xb) #endif PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayWrite(zz,&zarray)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray,3*a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i=0; inz - 3.0*a->nonzerorowcnt)); PetscFunctionReturn(0); } PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; const PetscScalar *x,*xb; const MatScalar *v; PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayWrite(zz,&zarray)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray,4*a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i=0; inz - 4.0*a->nonzerorowcnt)); PetscFunctionReturn(0); } PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray; const PetscScalar *xb,*x; const MatScalar *v; const PetscInt *idx,*ii,*ridx=NULL; PetscInt mbs,i,j,n; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayWrite(zz,&zarray)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray,5*a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i=0; inz - 5.0*a->nonzerorowcnt)); PetscFunctionReturn(0); } PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6; const PetscScalar *x,*xb; PetscScalar x1,x2,x3,x4,x5,x6,*zarray; const MatScalar *v; PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayWrite(zz,&zarray)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray,6*a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i=0; inz - 6.0*a->nonzerorowcnt)); PetscFunctionReturn(0); } PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7; const PetscScalar *x,*xb; PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; const MatScalar *v; PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayWrite(zz,&zarray)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray,7*a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i=0; inz - 7.0*a->nonzerorowcnt)); PetscFunctionReturn(0); } #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *z = NULL,*work,*workt,*zarray; const PetscScalar *x,*xb; const MatScalar *v; PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; const PetscInt *idx,*ii,*ridx=NULL; PetscInt k; PetscBool usecprow=a->compressedrow.use; __m256d a0,a1,a2,a3,a4,a5; __m256d w0,w1,w2,w3; __m256d z0,z1,z2; __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayWrite(zz,&zarray)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray,bs*a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } if (!a->mult_work) { k = PetscMax(A->rmap->n,A->cmap->n); PetscCall(PetscMalloc1(k+1,&a->mult_work)); } work = a->mult_work; for (i=0; inz*bs2 - bs*a->nonzerorowcnt)); PetscFunctionReturn(0); } #endif PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; const PetscScalar *x,*xb; PetscScalar *zarray,xv; const MatScalar *v; const PetscInt *ii,*ij=a->j,*idx; PetscInt mbs,i,j,k,n,*ridx=NULL; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayWrite(zz,&zarray)); v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray,11*a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i=0; inz - 11.0*a->nonzerorowcnt)); PetscFunctionReturn(0); } /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */ PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; const PetscScalar *x,*xb; PetscScalar *zarray,xv; const MatScalar *v; const PetscInt *ii,*ij=a->j,*idx; PetscInt mbs,i,j,k,n,*ridx=NULL; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayWrite(zz,&zarray)); v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray,12*a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i=0; inz - 12.0*a->nonzerorowcnt)); PetscFunctionReturn(0); } PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec yy,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; const PetscScalar *x,*xb; PetscScalar *zarray,*yarray,xv; const MatScalar *v; const PetscInt *ii,*ij=a->j,*idx; PetscInt mbs = a->mbs,i,j,k,n,*ridx=NULL; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); v = a->a; if (usecprow) { if (zz != yy) { PetscCall(PetscArraycpy(zarray,yarray,12*mbs)); } mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { ii = a->i; y = yarray; z = zarray; } for (i=0; inz - 12.0*a->nonzerorowcnt)); PetscFunctionReturn(0); } /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; const PetscScalar *x,*xb; PetscScalar x1,x2,x3,x4,*zarray; const MatScalar *v; const PetscInt *ii,*ij=a->j,*idx,*ridx=NULL; PetscInt mbs,i,j,n; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayWrite(zz,&zarray)); v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray,12*a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i=0; inz - 12.0*a->nonzerorowcnt)); PetscFunctionReturn(0); } /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec yy,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; const PetscScalar *x,*xb; PetscScalar x1,x2,x3,x4,*zarray,*yarray; const MatScalar *v; const PetscInt *ii,*ij=a->j,*idx,*ridx=NULL; PetscInt mbs = a->mbs,i,j,n; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); v = a->a; if (usecprow) { if (zz != yy) { PetscCall(PetscArraycpy(zarray,yarray,12*mbs)); } mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { ii = a->i; y = yarray; z = zarray; } for (i=0; inz - 12.0*a->nonzerorowcnt)); PetscFunctionReturn(0); } #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A,Vec xx,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *z = NULL,*zarray; const PetscScalar *x,*work; const MatScalar *v = a->a; PetscInt mbs,i,j,n; const PetscInt *idx = a->j,*ii,*ridx=NULL; PetscBool usecprow=a->compressedrow.use; const PetscInt bs = 12, bs2 = 144; __m256d a0,a1,a2,a3,a4,a5; __m256d w0,w1,w2,w3; __m256d z0,z1,z2; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayWrite(zz,&zarray)); if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray,bs*a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i=0; inz*bs2 - bs*a->nonzerorowcnt)); PetscFunctionReturn(0); } #endif /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ /* Default MatMult for block size 15 */ PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; const PetscScalar *x,*xb; PetscScalar *zarray,xv; const MatScalar *v; const PetscInt *ii,*ij=a->j,*idx; PetscInt mbs,i,j,k,n,*ridx=NULL; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayWrite(zz,&zarray)); v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray,15*a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i=0; inz - 15.0*a->nonzerorowcnt)); PetscFunctionReturn(0); } /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; const PetscScalar *x,*xb; PetscScalar x1,x2,x3,x4,*zarray; const MatScalar *v; const PetscInt *ii,*ij=a->j,*idx; PetscInt mbs,i,j,n,*ridx=NULL; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayWrite(zz,&zarray)); v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray,15*a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i=0; inz - 15.0*a->nonzerorowcnt)); PetscFunctionReturn(0); } /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; const PetscScalar *x,*xb; PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; const MatScalar *v; const PetscInt *ii,*ij=a->j,*idx; PetscInt mbs,i,j,n,*ridx=NULL; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayWrite(zz,&zarray)); v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray,15*a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i=0; inz - 15.0*a->nonzerorowcnt)); PetscFunctionReturn(0); } /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; const PetscScalar *x,*xb; PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; const MatScalar *v; const PetscInt *ii,*ij=a->j,*idx; PetscInt mbs,i,j,n,*ridx=NULL; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayWrite(zz,&zarray)); v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray,15*a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i=0; inz - 15.0*a->nonzerorowcnt)); PetscFunctionReturn(0); } /* This will not work with MatScalar == float because it calls the BLAS */ PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *z = NULL,*work,*workt,*zarray; const PetscScalar *x,*xb; const MatScalar *v; PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; const PetscInt *idx,*ii,*ridx=NULL; PetscInt ncols,k; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayWrite(zz,&zarray)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray,bs*a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } if (!a->mult_work) { k = PetscMax(A->rmap->n,A->cmap->n); PetscCall(PetscMalloc1(k+1,&a->mult_work)); } work = a->mult_work; for (i=0; inz*bs2 - bs*a->nonzerorowcnt)); PetscFunctionReturn(0); } PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; const PetscScalar *x; PetscScalar *y,*z,sum; const MatScalar *v; PetscInt mbs=a->mbs,i,n,*ridx=NULL; const PetscInt *idx,*ii; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayPair(yy,zz,&y,&z)); idx = a->j; v = a->a; if (usecprow) { if (zz != yy) { PetscCall(PetscArraycpy(z,y,mbs)); } mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { ii = a->i; } for (i=0; inz)); PetscFunctionReturn(0); } PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *y = NULL,*z = NULL,sum1,sum2; const PetscScalar *x,*xb; PetscScalar x1,x2,*yarray,*zarray; const MatScalar *v; PetscInt mbs = a->mbs,i,n,j; const PetscInt *idx,*ii,*ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); idx = a->j; v = a->a; if (usecprow) { if (zz != yy) { PetscCall(PetscArraycpy(zarray,yarray,2*mbs)); } mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { ii = a->i; y = yarray; z = zarray; } for (i=0; inz)); PetscFunctionReturn(0); } PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; const PetscScalar *x,*xb; const MatScalar *v; PetscInt mbs = a->mbs,i,j,n; const PetscInt *idx,*ii,*ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); idx = a->j; v = a->a; if (usecprow) { if (zz != yy) { PetscCall(PetscArraycpy(zarray,yarray,3*mbs)); } mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { ii = a->i; y = yarray; z = zarray; } for (i=0; inz)); PetscFunctionReturn(0); } PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; const PetscScalar *x,*xb; const MatScalar *v; PetscInt mbs = a->mbs,i,j,n; const PetscInt *idx,*ii,*ridx=NULL; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); idx = a->j; v = a->a; if (usecprow) { if (zz != yy) { PetscCall(PetscArraycpy(zarray,yarray,4*mbs)); } mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { ii = a->i; y = yarray; z = zarray; } for (i=0; inz)); PetscFunctionReturn(0); } PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; const PetscScalar *x,*xb; PetscScalar *yarray,*zarray; const MatScalar *v; PetscInt mbs = a->mbs,i,j,n; const PetscInt *idx,*ii,*ridx = NULL; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); idx = a->j; v = a->a; if (usecprow) { if (zz != yy) { PetscCall(PetscArraycpy(zarray,yarray,5*mbs)); } mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { ii = a->i; y = yarray; z = zarray; } for (i=0; inz)); PetscFunctionReturn(0); } PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6; const PetscScalar *x,*xb; PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; const MatScalar *v; PetscInt mbs = a->mbs,i,j,n; const PetscInt *idx,*ii,*ridx=NULL; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); idx = a->j; v = a->a; if (usecprow) { if (zz != yy) { PetscCall(PetscArraycpy(zarray,yarray,6*mbs)); } mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { ii = a->i; y = yarray; z = zarray; } for (i=0; inz)); PetscFunctionReturn(0); } PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7; const PetscScalar *x,*xb; PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; const MatScalar *v; PetscInt mbs = a->mbs,i,j,n; const PetscInt *idx,*ii,*ridx = NULL; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); idx = a->j; v = a->a; if (usecprow) { if (zz != yy) { PetscCall(PetscArraycpy(zarray,yarray,7*mbs)); } mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { ii = a->i; y = yarray; z = zarray; } for (i=0; inz)); PetscFunctionReturn(0); } #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *z = NULL,*work,*workt,*zarray; const PetscScalar *x,*xb; const MatScalar *v; PetscInt mbs,i,j,n; PetscInt k; PetscBool usecprow=a->compressedrow.use; const PetscInt *idx,*ii,*ridx=NULL,bs = 9, bs2 = 81; __m256d a0,a1,a2,a3,a4,a5; __m256d w0,w1,w2,w3; __m256d z0,z1,z2; __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); PetscFunctionBegin; PetscCall(VecCopy(yy,zz)); PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArray(zz,&zarray)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { mbs = a->mbs; ii = a->i; z = zarray; } if (!a->mult_work) { k = PetscMax(A->rmap->n,A->cmap->n); PetscCall(PetscMalloc1(k+1,&a->mult_work)); } work = a->mult_work; for (i=0; inz)); PetscFunctionReturn(0); } #endif PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; const PetscScalar *x,*xb; PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray; const MatScalar *v; PetscInt mbs = a->mbs,i,j,n; const PetscInt *idx,*ii,*ridx = NULL; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); idx = a->j; v = a->a; if (usecprow) { if (zz != yy) { PetscCall(PetscArraycpy(zarray,yarray,7*mbs)); } mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { ii = a->i; y = yarray; z = zarray; } for (i=0; inz)); PetscFunctionReturn(0); } PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *z = NULL,*work,*workt,*zarray; const PetscScalar *x,*xb; const MatScalar *v; PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; PetscInt ncols,k; const PetscInt *ridx = NULL,*idx,*ii; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecCopy(yy,zz)); PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArray(zz,&zarray)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { mbs = a->mbs; ii = a->i; z = zarray; } if (!a->mult_work) { k = PetscMax(A->rmap->n,A->cmap->n); PetscCall(PetscMalloc1(k+1,&a->mult_work)); } work = a->mult_work; for (i=0; inz*bs2)); PetscFunctionReturn(0); } PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) { PetscScalar zero = 0.0; PetscFunctionBegin; PetscCall(VecSet(zz,zero)); PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz)); PetscFunctionReturn(0); } PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) { PetscScalar zero = 0.0; PetscFunctionBegin; PetscCall(VecSet(zz,zero)); PetscCall(MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz)); PetscFunctionReturn(0); } PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *z,x1,x2,x3,x4,x5; const PetscScalar *x,*xb = NULL; const MatScalar *v; PetscInt mbs,i,rval,bs=A->rmap->bs,j,n; const PetscInt *idx,*ii,*ib,*ridx = NULL; Mat_CompressedRow cprow = a->compressedrow; PetscBool usecprow = cprow.use; PetscFunctionBegin; if (yy != zz) PetscCall(VecCopy(yy,zz)); PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArray(zz,&z)); idx = a->j; v = a->a; if (usecprow) { mbs = cprow.nrows; ii = cprow.i; ridx = cprow.rindex; } else { mbs=a->mbs; ii = a->i; xb = x; } switch (bs) { case 1: for (i=0; ibs2; PetscScalar *work,*workt,zb; const PetscScalar *xtmp; if (!a->mult_work) { k = PetscMax(A->rmap->n,A->cmap->n); PetscCall(PetscMalloc1(k+1,&a->mult_work)); } work = a->mult_work; xtmp = x; for (i=0; inz*a->bs2)); PetscFunctionReturn(0); } PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *zb,*z,x1,x2,x3,x4,x5; const PetscScalar *x,*xb = NULL; const MatScalar *v; PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; const PetscInt *idx,*ii,*ib,*ridx = NULL; Mat_CompressedRow cprow = a->compressedrow; PetscBool usecprow=cprow.use; PetscFunctionBegin; if (yy != zz) PetscCall(VecCopy(yy,zz)); PetscCall(VecGetArrayRead(xx,&x)); PetscCall(VecGetArray(zz,&z)); idx = a->j; v = a->a; if (usecprow) { mbs = cprow.nrows; ii = cprow.i; ridx = cprow.rindex; } else { mbs=a->mbs; ii = a->i; xb = x; } switch (bs) { case 1: for (i=0; imult_work) { k = PetscMax(A->rmap->n,A->cmap->n); PetscCall(PetscMalloc1(k+1,&a->mult_work)); } work = a->mult_work; xtmp = x; for (i=0; inz*a->bs2)); PetscFunctionReturn(0); } PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; PetscInt totalnz = a->bs2*a->nz; PetscScalar oalpha = alpha; PetscBLASInt one = 1,tnz; PetscFunctionBegin; PetscCall(PetscBLASIntCast(totalnz,&tnz)); PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); PetscCall(PetscLogFlops(totalnz)); PetscFunctionReturn(0); } PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; MatScalar *v = a->a; PetscReal sum = 0.0; PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; PetscFunctionBegin; if (type == NORM_FROBENIUS) { #if defined(PETSC_USE_REAL___FP16) PetscBLASInt one = 1,cnt = bs2*nz; PetscStackCallBLAS("BLASnrm2",*norm = BLASnrm2_(&cnt,v,&one)); #else for (i=0; ij; PetscCall(PetscCalloc1(A->cmap->n+1,&tmp)); for (i=0; icmap->n; j++) { if (tmp[j] > *norm) *norm = tmp[j]; } PetscCall(PetscFree(tmp)); PetscCall(PetscLogFlops(PetscMax(bs2*nz-1,0))); } else if (type == NORM_INFINITY) { /* maximum row sum */ *norm = 0.0; for (k=0; kmbs; j++) { v = a->a + bs2*a->i[j] + k; sum = 0.0; for (i=0; ii[j+1]-a->i[j]; i++) { for (k1=0; k1 *norm) *norm = sum; } } PetscCall(PetscLogFlops(PetscMax(bs2*nz-1,0))); } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); PetscFunctionReturn(0); } PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; PetscFunctionBegin; /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { *flg = PETSC_FALSE; PetscFunctionReturn(0); } /* if the a->i are the same */ PetscCall(PetscArraycmp(a->i,b->i,a->mbs+1,flg)); if (!*flg) PetscFunctionReturn(0); /* if a->j are the same */ PetscCall(PetscArraycmp(a->j,b->j,a->nz,flg)); if (!*flg) PetscFunctionReturn(0); /* if a->a are the same */ PetscCall(PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg)); PetscFunctionReturn(0); } PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; PetscScalar *x,zero = 0.0; MatScalar *aa,*aa_j; PetscFunctionBegin; PetscCheck(!A->factortype,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); bs = A->rmap->bs; aa = a->a; ai = a->i; aj = a->j; ambs = a->mbs; bs2 = a->bs2; PetscCall(VecSet(v,zero)); PetscCall(VecGetArray(v,&x)); PetscCall(VecGetLocalSize(v,&n)); PetscCheck(n == A->rmap->N,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); for (i=0; idata; const PetscScalar *l,*r,*li,*ri; PetscScalar x; MatScalar *aa, *v; PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; const PetscInt *ai,*aj; PetscFunctionBegin; ai = a->i; aj = a->j; aa = a->a; m = A->rmap->n; n = A->cmap->n; bs = A->rmap->bs; mbs = a->mbs; bs2 = a->bs2; if (ll) { PetscCall(VecGetArrayRead(ll,&l)); PetscCall(VecGetLocalSize(ll,&lm)); PetscCheck(lm == m,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); for (i=0; inz)); } if (rr) { PetscCall(VecGetArrayRead(rr,&r)); PetscCall(VecGetLocalSize(rr,&rn)); PetscCheck(rn == n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); for (i=0; inz)); } PetscFunctionReturn(0); } PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscFunctionBegin; info->block_size = a->bs2; info->nz_allocated = a->bs2*a->maxnz; info->nz_used = a->bs2*a->nz; info->nz_unneeded = info->nz_allocated - info->nz_used; info->assemblies = A->num_ass; info->mallocs = A->info.mallocs; info->memory = ((PetscObject)A)->mem; if (A->factortype) { info->fill_ratio_given = A->info.fill_ratio_given; info->fill_ratio_needed = A->info.fill_ratio_needed; info->factor_mallocs = A->info.factor_mallocs; } else { info->fill_ratio_given = 0; info->fill_ratio_needed = 0; info->factor_mallocs = 0; } PetscFunctionReturn(0); } PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscFunctionBegin; PetscCall(PetscArrayzero(a->a,a->bs2*a->i[a->mbs])); PetscFunctionReturn(0); } PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C) { PetscFunctionBegin; PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C)); C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; PetscFunctionReturn(0); } PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; PetscScalar *z = NULL,sum1; const PetscScalar *xb; PetscScalar x1; const MatScalar *v,*vv; PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { mbs = a->mbs; ii = a->i; z = c; } for (i=0; idata; PetscScalar *z = NULL,sum1,sum2; const PetscScalar *xb; PetscScalar x1,x2; const MatScalar *v,*vv; PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { mbs = a->mbs; ii = a->i; z = c; } for (i=0; idata; PetscScalar *z = NULL,sum1,sum2,sum3; const PetscScalar *xb; PetscScalar x1,x2,x3; const MatScalar *v,*vv; PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { mbs = a->mbs; ii = a->i; z = c; } for (i=0; idata; PetscScalar *z = NULL,sum1,sum2,sum3,sum4; const PetscScalar *xb; PetscScalar x1,x2,x3,x4; const MatScalar *v,*vv; PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { mbs = a->mbs; ii = a->i; z = c; } for (i=0; idata; PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5; const PetscScalar *xb; PetscScalar x1,x2,x3,x4,x5; const MatScalar *v,*vv; PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { mbs = a->mbs; ii = a->i; z = c; } for (i=0; idata; Mat_SeqDense *bd = (Mat_SeqDense*)B->data; Mat_SeqDense *cd = (Mat_SeqDense*)C->data; PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda; PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; PetscBLASInt bbs,bcn,bbm,bcm; PetscScalar *z = NULL; PetscScalar *c,*b; const MatScalar *v; const PetscInt *idx,*ii,*ridx=NULL; PetscScalar _DZero=0.0,_DOne=1.0; PetscBool usecprow=a->compressedrow.use; PetscFunctionBegin; if (!cm || !cn) PetscFunctionReturn(0); PetscCheck(B->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n); PetscCheck(A->rmap->n == C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n); PetscCheck(B->cmap->n == C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n); b = bd->v; if (a->nonzerorowcnt != A->rmap->n) { PetscCall(MatZeroEntries(C)); } PetscCall(MatDenseGetArray(C,&c)); switch (bs) { case 1: PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn)); break; case 2: PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn)); break; case 3: PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn)); break; case 4: PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn)); break; case 5: PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn)); break; default: /* block sizes larger than 5 by 5 are handled by BLAS */ PetscCall(PetscBLASIntCast(bs,&bbs)); PetscCall(PetscBLASIntCast(cn,&bcn)); PetscCall(PetscBLASIntCast(bm,&bbm)); PetscCall(PetscBLASIntCast(cm,&bcm)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { mbs = a->mbs; ii = a->i; z = c; } for (i=0; inz*bs2 - bs*a->nonzerorowcnt)*cn)); PetscFunctionReturn(0); }