/* Factorization code for BAIJ format. */ #include <../src/mat/impls/baij/seq/baij.h> #include /* ------------------------------------------------------------*/ /* Version for when blocks are 4 by 4 */ #undef __FUNCT__ #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_inplace" PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; IS isrow = b->row,isicol = b->icol; PetscErrorCode ierr; const PetscInt *r,*ic; PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; PetscInt *ajtmpold,*ajtmp,nz,row; PetscInt *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj; MatScalar *pv,*v,*rtmp,*pc,*w,*x; MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; MatScalar m13,m14,m15,m16; MatScalar *ba = b->a,*aa = a->a; PetscBool pivotinblocks = b->pivotinblocks; PetscReal shift = info->shiftamount; PetscFunctionBegin; ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); for (i=0; ia */ pv = ba + 16*bi[i]; pj = bj + bi[i]; nz = bi[i+1] - bi[i]; for (j=0; jops->solve = MatSolve_SeqBAIJ_4_inplace; C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace; C->assembled = PETSC_TRUE; ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ PetscFunctionReturn(0); } /* MatLUFactorNumeric_SeqBAIJ_4 - copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented PetscKernel_A_gets_A_times_B() PetscKernel_A_gets_A_minus_B_times_C() PetscKernel_A_gets_inverse_A() */ #undef __FUNCT__ #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4" PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info) { Mat C = B; Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data; IS isrow = b->row,isicol = b->icol; PetscErrorCode ierr; const PetscInt *r,*ic; PetscInt i,j,k,nz,nzL,row; const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2; MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; PetscInt flg; PetscReal shift; PetscFunctionBegin; ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); if (info->shifttype == MAT_SHIFT_NONE) { shift = 0; } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ shift = info->shiftamount; } /* generate work space needed by the factorization */ ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr); ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); for (i=0; ia + bs2*bdiag[row]; /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr); pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ pv = b->a + bs2*(bdiag[row+1]+1); nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ for (j=0; ja */ /* L part */ pv = b->a + bs2*bi[i]; pj = b->j + bi[i]; nz = bi[i+1] - bi[i]; for (j=0; ja + bs2*bdiag[i]; pj = b->j + bdiag[i]; ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); ierr = PetscKernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr); /* U part */ pv = b->a + bs2*(bdiag[i+1]+1); pj = b->j + bdiag[i+1]+1; nz = bdiag[i] - bdiag[i+1] - 1; for (j=0; jops->solve = MatSolve_SeqBAIJ_4; C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; C->assembled = PETSC_TRUE; ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ PetscFunctionReturn(0); } #undef __FUNCT__ #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace" PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info) { /* Default Version for when blocks are 4 by 4 Using natural ordering */ Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; PetscErrorCode ierr; PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; PetscInt *ajtmpold,*ajtmp,nz,row; PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; MatScalar *pv,*v,*rtmp,*pc,*w,*x; MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; MatScalar m13,m14,m15,m16; MatScalar *ba = b->a,*aa = a->a; PetscBool pivotinblocks = b->pivotinblocks; PetscReal shift = info->shiftamount; PetscFunctionBegin; ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); for (i=0; ia */ pv = ba + 16*bi[i]; pj = bj + bi[i]; nz = bi[i+1] - bi[i]; for (j=0; jops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace; C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace; C->assembled = PETSC_TRUE; ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ PetscFunctionReturn(0); } /* MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering - copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace() */ #undef __FUNCT__ #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering" PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) { Mat C =B; Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data; PetscErrorCode ierr; PetscInt i,j,k,nz,nzL,row; const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2; MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; PetscInt flg; PetscReal shift; PetscFunctionBegin; /* generate work space needed by the factorization */ ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr); ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); if (info->shifttype == MAT_SHIFT_NONE) { shift = 0; } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ shift = info->shiftamount; } for (i=0; ia + bs2*bdiag[row]; /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr); pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ pv = b->a + bs2*(bdiag[row+1]+1); nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ for (j=0; ja */ /* L part */ pv = b->a + bs2*bi[i]; pj = b->j + bi[i]; nz = bi[i+1] - bi[i]; for (j=0; ja + bs2*bdiag[i]; pj = b->j + bdiag[i]; ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); ierr = PetscKernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr); /* U part */ pv = b->a + bs2*(bdiag[i+1]+1); pj = b->j + bdiag[i+1]+1; nz = bdiag[i] - bdiag[i+1] - 1; for (j=0; jops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; C->assembled = PETSC_TRUE; ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ PetscFunctionReturn(0); } #if defined(PETSC_HAVE_SSE) #include PETSC_HAVE_SSE /* SSE Version for when blocks are 4 by 4 Using natural ordering */ #undef __FUNCT__ #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE" PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; PetscErrorCode ierr; int i,j,n = a->mbs; int *bj = b->j,*bjtmp,*pj; int row; int *ajtmpold,nz,*bi=b->i; int *diag_offset = b->diag,*ai=a->i,*aj=a->j; MatScalar *pv,*v,*rtmp,*pc,*w,*x; MatScalar *ba = b->a,*aa = a->a; int nonzero=0; /* int nonzero=0,colscale = 16; */ PetscBool pivotinblocks = b->pivotinblocks; PetscReal shift = info->shiftamount; PetscFunctionBegin; SSE_SCOPE_BEGIN; if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); /* if ((unsigned long)bj==(unsigned long)aj) { */ /* colscale = 4; */ /* } */ for (i=0; iXMM3 */ /* Column 3, product is accumulated in XMM0 */ SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) SSE_SHUFFLE(XMM7,XMM7,0x00) SSE_MULT_PS(XMM0,XMM7) SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) SSE_SHUFFLE(XMM7,XMM7,0x00) SSE_MULT_PS(XMM1,XMM7) SSE_ADD_PS(XMM0,XMM1) SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) SSE_SHUFFLE(XMM1,XMM1,0x00) SSE_MULT_PS(XMM1,XMM2) SSE_ADD_PS(XMM0,XMM1) SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) SSE_SHUFFLE(XMM7,XMM7,0x00) SSE_MULT_PS(XMM3,XMM7) SSE_ADD_PS(XMM0,XMM3) SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ /* This is code to be maintained and read by humans afterall. */ /* Copy Multiplier Col 3 into XMM3 */ SSE_COPY_PS(XMM3,XMM0) /* Copy Multiplier Col 2 into XMM2 */ SSE_COPY_PS(XMM2,XMM6) /* Copy Multiplier Col 1 into XMM1 */ SSE_COPY_PS(XMM1,XMM5) /* Copy Multiplier Col 0 into XMM0 */ SSE_COPY_PS(XMM0,XMM4) SSE_INLINE_END_2; /* Update the row: */ nz = bi[row+1] - diag_offset[row] - 1; pv += 16; for (j=0; ja */ pv = ba + 16*bi[i]; pj = bj + bi[i]; nz = bi[i+1] - bi[i]; /* Copy x block back into pv block */ for (j=0; jops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; C->assembled = PETSC_TRUE; ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* Flop Count from inverting diagonal blocks */ SSE_SCOPE_END; PetscFunctionReturn(0); } #undef __FUNCT__ #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace" PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C) { Mat A =C; Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; PetscErrorCode ierr; int i,j,n = a->mbs; unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj; unsigned short *aj = (unsigned short*)(a->j),*ajtmp; unsigned int row; int nz,*bi=b->i; int *diag_offset = b->diag,*ai=a->i; MatScalar *pv,*v,*rtmp,*pc,*w,*x; MatScalar *ba = b->a,*aa = a->a; int nonzero=0; /* int nonzero=0,colscale = 16; */ PetscBool pivotinblocks = b->pivotinblocks; PetscReal shift = info->shiftamount; PetscFunctionBegin; SSE_SCOPE_BEGIN; if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); /* if ((unsigned long)bj==(unsigned long)aj) { */ /* colscale = 4; */ /* } */ for (i=0; iXMM3 */ /* Column 3, product is accumulated in XMM0 */ SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) SSE_SHUFFLE(XMM7,XMM7,0x00) SSE_MULT_PS(XMM0,XMM7) SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) SSE_SHUFFLE(XMM7,XMM7,0x00) SSE_MULT_PS(XMM1,XMM7) SSE_ADD_PS(XMM0,XMM1) SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) SSE_SHUFFLE(XMM1,XMM1,0x00) SSE_MULT_PS(XMM1,XMM2) SSE_ADD_PS(XMM0,XMM1) SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) SSE_SHUFFLE(XMM7,XMM7,0x00) SSE_MULT_PS(XMM3,XMM7) SSE_ADD_PS(XMM0,XMM3) SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ /* This is code to be maintained and read by humans afterall. */ /* Copy Multiplier Col 3 into XMM3 */ SSE_COPY_PS(XMM3,XMM0) /* Copy Multiplier Col 2 into XMM2 */ SSE_COPY_PS(XMM2,XMM6) /* Copy Multiplier Col 1 into XMM1 */ SSE_COPY_PS(XMM1,XMM5) /* Copy Multiplier Col 0 into XMM0 */ SSE_COPY_PS(XMM0,XMM4) SSE_INLINE_END_2; /* Update the row: */ nz = bi[row+1] - diag_offset[row] - 1; pv += 16; for (j=0; ja */ pv = ba + 16*bi[i]; pj = bj + bi[i]; nz = bi[i+1] - bi[i]; /* Copy x block back into pv block */ for (j=0; jops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; C->assembled = PETSC_TRUE; ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* Flop Count from inverting diagonal blocks */ SSE_SCOPE_END; PetscFunctionReturn(0); } #undef __FUNCT__ #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj" PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; PetscErrorCode ierr; int i,j,n = a->mbs; unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj; unsigned int row; int *ajtmpold,nz,*bi=b->i; int *diag_offset = b->diag,*ai=a->i,*aj=a->j; MatScalar *pv,*v,*rtmp,*pc,*w,*x; MatScalar *ba = b->a,*aa = a->a; int nonzero=0; /* int nonzero=0,colscale = 16; */ PetscBool pivotinblocks = b->pivotinblocks; PetscReal shift = info->shiftamount; PetscFunctionBegin; SSE_SCOPE_BEGIN; if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); /* if ((unsigned long)bj==(unsigned long)aj) { */ /* colscale = 4; */ /* } */ if ((unsigned long)bj==(unsigned long)aj) { return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C)); } for (i=0; iXMM3 */ /* Column 3, product is accumulated in XMM0 */ SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) SSE_SHUFFLE(XMM7,XMM7,0x00) SSE_MULT_PS(XMM0,XMM7) SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) SSE_SHUFFLE(XMM7,XMM7,0x00) SSE_MULT_PS(XMM1,XMM7) SSE_ADD_PS(XMM0,XMM1) SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) SSE_SHUFFLE(XMM1,XMM1,0x00) SSE_MULT_PS(XMM1,XMM2) SSE_ADD_PS(XMM0,XMM1) SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) SSE_SHUFFLE(XMM7,XMM7,0x00) SSE_MULT_PS(XMM3,XMM7) SSE_ADD_PS(XMM0,XMM3) SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ /* This is code to be maintained and read by humans afterall. */ /* Copy Multiplier Col 3 into XMM3 */ SSE_COPY_PS(XMM3,XMM0) /* Copy Multiplier Col 2 into XMM2 */ SSE_COPY_PS(XMM2,XMM6) /* Copy Multiplier Col 1 into XMM1 */ SSE_COPY_PS(XMM1,XMM5) /* Copy Multiplier Col 0 into XMM0 */ SSE_COPY_PS(XMM0,XMM4) SSE_INLINE_END_2; /* Update the row: */ nz = bi[row+1] - diag_offset[row] - 1; pv += 16; for (j=0; ja */ pv = ba + 16*bi[i]; pj = bj + bi[i]; nz = bi[i+1] - bi[i]; /* Copy x block back into pv block */ for (j=0; jops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; C->assembled = PETSC_TRUE; ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* Flop Count from inverting diagonal blocks */ SSE_SCOPE_END; PetscFunctionReturn(0); } #endif