1*9ae82921SPaul Mullowney /* 2*9ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3*9ae82921SPaul Mullowney matrix storage format. 4*9ae82921SPaul Mullowney */ 5*9ae82921SPaul Mullowney 6*9ae82921SPaul Mullowney #include "petscconf.h" 7*9ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_BEGIN 8*9ae82921SPaul Mullowney #include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 9*9ae82921SPaul Mullowney //#include "petscbt.h" 10*9ae82921SPaul Mullowney #include "../src/vec/vec/impls/dvecimpl.h" 11*9ae82921SPaul Mullowney #include "petsc-private/vecimpl.h" 12*9ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_END 13*9ae82921SPaul Mullowney #undef VecType 14*9ae82921SPaul Mullowney #include "cusparsematimpl.h" 15*9ae82921SPaul Mullowney 16*9ae82921SPaul Mullowney 17*9ae82921SPaul Mullowney EXTERN_C_BEGIN 18*9ae82921SPaul Mullowney PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 19*9ae82921SPaul Mullowney PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 20*9ae82921SPaul Mullowney PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *); 21*9ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 22*9ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 23*9ae82921SPaul Mullowney PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat); 24*9ae82921SPaul Mullowney PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat); 25*9ae82921SPaul Mullowney EXTERN_C_END 26*9ae82921SPaul Mullowney 27*9ae82921SPaul Mullowney EXTERN_C_BEGIN 28*9ae82921SPaul Mullowney #undef __FUNCT__ 29*9ae82921SPaul Mullowney #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse" 30*9ae82921SPaul Mullowney PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type) 31*9ae82921SPaul Mullowney { 32*9ae82921SPaul Mullowney PetscFunctionBegin; 33*9ae82921SPaul Mullowney *type = MATSOLVERCUSPARSE; 34*9ae82921SPaul Mullowney PetscFunctionReturn(0); 35*9ae82921SPaul Mullowney } 36*9ae82921SPaul Mullowney EXTERN_C_END 37*9ae82921SPaul Mullowney 38*9ae82921SPaul Mullowney EXTERN_C_BEGIN 39*9ae82921SPaul Mullowney extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*); 40*9ae82921SPaul Mullowney EXTERN_C_END 41*9ae82921SPaul Mullowney 42*9ae82921SPaul Mullowney 43*9ae82921SPaul Mullowney 44*9ae82921SPaul Mullowney /*MC 45*9ae82921SPaul Mullowney MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices 46*9ae82921SPaul Mullowney on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp 47*9ae82921SPaul Mullowney 48*9ae82921SPaul Mullowney ./configure --download-txpetscgpu to install PETSc to use CUSPARSE solves 49*9ae82921SPaul Mullowney 50*9ae82921SPaul Mullowney Options Database Keys: 51*9ae82921SPaul Mullowney + -mat_mult_cusparse_storage_format <CSR> - (choose one of) CSR, ELL 52*9ae82921SPaul Mullowney + -mat_solve_cusparse_storage_format <CSR> - (choose one of) CSR, ELL 53*9ae82921SPaul Mullowney 54*9ae82921SPaul Mullowney Level: beginner 55*9ae82921SPaul Mullowney M*/ 56*9ae82921SPaul Mullowney 57*9ae82921SPaul Mullowney EXTERN_C_BEGIN 58*9ae82921SPaul Mullowney #undef __FUNCT__ 59*9ae82921SPaul Mullowney #define __FUNCT__ "MatGetFactor_seqaij_cusparse" 60*9ae82921SPaul Mullowney PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B) 61*9ae82921SPaul Mullowney { 62*9ae82921SPaul Mullowney PetscErrorCode ierr; 63*9ae82921SPaul Mullowney 64*9ae82921SPaul Mullowney PetscFunctionBegin; 65*9ae82921SPaul Mullowney ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr); 66*9ae82921SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT){ 67*9ae82921SPaul Mullowney ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 68*9ae82921SPaul Mullowney ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr); 69*9ae82921SPaul Mullowney ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); 70*9ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 71*9ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 72*9ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 73*9ae82921SPaul Mullowney (*B)->factortype = ftype; 74*9ae82921SPaul Mullowney PetscFunctionReturn(0); 75*9ae82921SPaul Mullowney } 76*9ae82921SPaul Mullowney EXTERN_C_END 77*9ae82921SPaul Mullowney 78*9ae82921SPaul Mullowney #undef __FUNCT__ 79*9ae82921SPaul Mullowney #define __FUNCT__ "MatAIJCUSPARSESetGPUStorageFormatForMatMult" 80*9ae82921SPaul Mullowney PetscErrorCode MatAIJCUSPARSESetGPUStorageFormatForMatMult(Mat A,const GPUStorageFormat format) 81*9ae82921SPaul Mullowney { 82*9ae82921SPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 83*9ae82921SPaul Mullowney PetscFunctionBegin; 84*9ae82921SPaul Mullowney cusparseMat->format = format; 85*9ae82921SPaul Mullowney PetscFunctionReturn(0); 86*9ae82921SPaul Mullowney } 87*9ae82921SPaul Mullowney 88*9ae82921SPaul Mullowney #undef __FUNCT__ 89*9ae82921SPaul Mullowney #define __FUNCT__ "MatAIJCUSPARSESetGPUStorageFormatForMatSolve" 90*9ae82921SPaul Mullowney PetscErrorCode MatAIJCUSPARSESetGPUStorageFormatForMatSolve(Mat A,const GPUStorageFormat format) 91*9ae82921SPaul Mullowney { 92*9ae82921SPaul Mullowney PetscErrorCode ierr; 93*9ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 94*9ae82921SPaul Mullowney GPU_Matrix_Ifc* cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 95*9ae82921SPaul Mullowney GPU_Matrix_Ifc* cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 96*9ae82921SPaul Mullowney PetscFunctionBegin; 97*9ae82921SPaul Mullowney cusparseTriFactors->format = format; 98*9ae82921SPaul Mullowney if (cusparseMatLo && cusparseMatUp) { 99*9ae82921SPaul Mullowney // If data exists already delete 100*9ae82921SPaul Mullowney if (cusparseMatLo) 101*9ae82921SPaul Mullowney delete (GPU_Matrix_Ifc *)cusparseMatLo; 102*9ae82921SPaul Mullowney if (cusparseMatUp) 103*9ae82921SPaul Mullowney delete (GPU_Matrix_Ifc *)cusparseMatUp; 104*9ae82921SPaul Mullowney 105*9ae82921SPaul Mullowney // key data was deleted so reset the flag. 106*9ae82921SPaul Mullowney A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 107*9ae82921SPaul Mullowney 108*9ae82921SPaul Mullowney // rebuild matrix 109*9ae82921SPaul Mullowney ierr=MatCUSPARSEAnalysisAndCopyToGPU(A);CHKERRQ(ierr); 110*9ae82921SPaul Mullowney } 111*9ae82921SPaul Mullowney PetscFunctionReturn(0); 112*9ae82921SPaul Mullowney } 113*9ae82921SPaul Mullowney 114*9ae82921SPaul Mullowney 115*9ae82921SPaul Mullowney #undef __FUNCT__ 116*9ae82921SPaul Mullowney #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE" 117*9ae82921SPaul Mullowney PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A) 118*9ae82921SPaul Mullowney { 119*9ae82921SPaul Mullowney PetscErrorCode ierr; 120*9ae82921SPaul Mullowney PetscInt idx=0; 121*9ae82921SPaul Mullowney const char * formats[]={CSR,ELL}; 122*9ae82921SPaul Mullowney PetscBool flg; 123*9ae82921SPaul Mullowney PetscFunctionBegin; 124*9ae82921SPaul Mullowney ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"When using TxPETSCGPU, AIJCUSPARSE Options","Mat");CHKERRQ(ierr); 125*9ae82921SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 126*9ae82921SPaul Mullowney ierr = PetscOptionsEList("-mat_mult_cusparse_storage_format", 127*9ae82921SPaul Mullowney "Set the storage format of (seq)aijcusparse gpu matrices for SpMV", 128*9ae82921SPaul Mullowney "None",formats,2,formats[0],&idx,&flg);CHKERRQ(ierr); 129*9ae82921SPaul Mullowney ierr=MatAIJCUSPARSESetGPUStorageFormatForMatMult(A,formats[idx]);CHKERRQ(ierr); 130*9ae82921SPaul Mullowney } 131*9ae82921SPaul Mullowney else { 132*9ae82921SPaul Mullowney ierr = PetscOptionsEList("-mat_solve_cusparse_storage_format", 133*9ae82921SPaul Mullowney "Set the storage format of (seq)aijcusparse gpu matrices for TriSolve", 134*9ae82921SPaul Mullowney "None",formats,2,formats[0],&idx,&flg);CHKERRQ(ierr); 135*9ae82921SPaul Mullowney ierr=MatAIJCUSPARSESetGPUStorageFormatForMatSolve(A,formats[idx]);CHKERRQ(ierr); 136*9ae82921SPaul Mullowney } 137*9ae82921SPaul Mullowney ierr = PetscOptionsEnd();CHKERRQ(ierr); 138*9ae82921SPaul Mullowney PetscFunctionReturn(0); 139*9ae82921SPaul Mullowney 140*9ae82921SPaul Mullowney } 141*9ae82921SPaul Mullowney 142*9ae82921SPaul Mullowney #undef __FUNCT__ 143*9ae82921SPaul Mullowney #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE" 144*9ae82921SPaul Mullowney PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 145*9ae82921SPaul Mullowney { 146*9ae82921SPaul Mullowney PetscErrorCode ierr; 147*9ae82921SPaul Mullowney 148*9ae82921SPaul Mullowney PetscFunctionBegin; 149*9ae82921SPaul Mullowney ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 150*9ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 151*9ae82921SPaul Mullowney PetscFunctionReturn(0); 152*9ae82921SPaul Mullowney } 153*9ae82921SPaul Mullowney 154*9ae82921SPaul Mullowney #undef __FUNCT__ 155*9ae82921SPaul Mullowney #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE" 156*9ae82921SPaul Mullowney PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 157*9ae82921SPaul Mullowney { 158*9ae82921SPaul Mullowney PetscErrorCode ierr; 159*9ae82921SPaul Mullowney 160*9ae82921SPaul Mullowney PetscFunctionBegin; 161*9ae82921SPaul Mullowney ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 162*9ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 163*9ae82921SPaul Mullowney PetscFunctionReturn(0); 164*9ae82921SPaul Mullowney } 165*9ae82921SPaul Mullowney 166*9ae82921SPaul Mullowney #undef __FUNCT__ 167*9ae82921SPaul Mullowney #define __FUNCT__ "MatCUSPARSEBuildLowerTriMatrix" 168*9ae82921SPaul Mullowney PetscErrorCode MatCUSPARSEBuildLowerTriMatrix(Mat A) 169*9ae82921SPaul Mullowney { 170*9ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 171*9ae82921SPaul Mullowney PetscInt n = A->rmap->n; 172*9ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 173*9ae82921SPaul Mullowney GPU_Matrix_Ifc* cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 174*9ae82921SPaul Mullowney cusparseStatus_t stat; 175*9ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 176*9ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 177*9ae82921SPaul Mullowney PetscErrorCode ierr; 178*9ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 179*9ae82921SPaul Mullowney PetscScalar *AALo; 180*9ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 181*9ae82921SPaul Mullowney 182*9ae82921SPaul Mullowney PetscFunctionBegin; 183*9ae82921SPaul Mullowney if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){ 184*9ae82921SPaul Mullowney try { 185*9ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 186*9ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 187*9ae82921SPaul Mullowney 188*9ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 189*9ae82921SPaul Mullowney ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 190*9ae82921SPaul Mullowney ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr); 191*9ae82921SPaul Mullowney ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr); 192*9ae82921SPaul Mullowney 193*9ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 194*9ae82921SPaul Mullowney AiLo[0]=(PetscInt) 0; 195*9ae82921SPaul Mullowney AiLo[n]=nzLower; 196*9ae82921SPaul Mullowney AjLo[0]=(PetscInt) 0; 197*9ae82921SPaul Mullowney AALo[0]=(MatScalar) 1.0; 198*9ae82921SPaul Mullowney v = aa; 199*9ae82921SPaul Mullowney vi = aj; 200*9ae82921SPaul Mullowney offset=1; 201*9ae82921SPaul Mullowney rowOffset=1; 202*9ae82921SPaul Mullowney for (i=1; i<n; i++) { 203*9ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 204*9ae82921SPaul Mullowney // additional 1 for the term on the diagonal 205*9ae82921SPaul Mullowney AiLo[i]=rowOffset; 206*9ae82921SPaul Mullowney rowOffset+=nz+1; 207*9ae82921SPaul Mullowney 208*9ae82921SPaul Mullowney ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 209*9ae82921SPaul Mullowney ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 210*9ae82921SPaul Mullowney 211*9ae82921SPaul Mullowney offset+=nz; 212*9ae82921SPaul Mullowney AjLo[offset]=(PetscInt) i; 213*9ae82921SPaul Mullowney AALo[offset]=(MatScalar) 1.0; 214*9ae82921SPaul Mullowney offset+=1; 215*9ae82921SPaul Mullowney 216*9ae82921SPaul Mullowney v += nz; 217*9ae82921SPaul Mullowney vi += nz; 218*9ae82921SPaul Mullowney } 219*9ae82921SPaul Mullowney cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format); 220*9ae82921SPaul Mullowney stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat); 221*9ae82921SPaul Mullowney ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr); 222*9ae82921SPaul Mullowney stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat); 223*9ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat; 224*9ae82921SPaul Mullowney // free temporary data 225*9ae82921SPaul Mullowney ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr); 226*9ae82921SPaul Mullowney ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr); 227*9ae82921SPaul Mullowney ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 228*9ae82921SPaul Mullowney } catch(char* ex) { 229*9ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 230*9ae82921SPaul Mullowney } 231*9ae82921SPaul Mullowney } 232*9ae82921SPaul Mullowney PetscFunctionReturn(0); 233*9ae82921SPaul Mullowney } 234*9ae82921SPaul Mullowney 235*9ae82921SPaul Mullowney #undef __FUNCT__ 236*9ae82921SPaul Mullowney #define __FUNCT__ "MatCUSPARSEBuildUpperTriMatrix" 237*9ae82921SPaul Mullowney PetscErrorCode MatCUSPARSEBuildUpperTriMatrix(Mat A) 238*9ae82921SPaul Mullowney { 239*9ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 240*9ae82921SPaul Mullowney PetscInt n = A->rmap->n; 241*9ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 242*9ae82921SPaul Mullowney GPU_Matrix_Ifc* cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 243*9ae82921SPaul Mullowney cusparseStatus_t stat; 244*9ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 245*9ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 246*9ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 247*9ae82921SPaul Mullowney PetscScalar *AAUp; 248*9ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 249*9ae82921SPaul Mullowney PetscErrorCode ierr; 250*9ae82921SPaul Mullowney 251*9ae82921SPaul Mullowney PetscFunctionBegin; 252*9ae82921SPaul Mullowney 253*9ae82921SPaul Mullowney if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){ 254*9ae82921SPaul Mullowney try { 255*9ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. */ 256*9ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 257*9ae82921SPaul Mullowney 258*9ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 259*9ae82921SPaul Mullowney ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 260*9ae82921SPaul Mullowney ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 261*9ae82921SPaul Mullowney ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 262*9ae82921SPaul Mullowney 263*9ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 264*9ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 265*9ae82921SPaul Mullowney AiUp[n]=nzUpper; 266*9ae82921SPaul Mullowney offset = nzUpper; 267*9ae82921SPaul Mullowney for (i=n-1; i>=0; i--){ 268*9ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 269*9ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 270*9ae82921SPaul Mullowney 271*9ae82921SPaul Mullowney // number of elements NOT on the diagonal 272*9ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 273*9ae82921SPaul Mullowney 274*9ae82921SPaul Mullowney // decrement the offset 275*9ae82921SPaul Mullowney offset -= (nz+1); 276*9ae82921SPaul Mullowney 277*9ae82921SPaul Mullowney // first, set the diagonal elements 278*9ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 279*9ae82921SPaul Mullowney AAUp[offset] = 1./v[nz]; 280*9ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 281*9ae82921SPaul Mullowney 282*9ae82921SPaul Mullowney ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 283*9ae82921SPaul Mullowney ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 284*9ae82921SPaul Mullowney } 285*9ae82921SPaul Mullowney cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format); 286*9ae82921SPaul Mullowney stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 287*9ae82921SPaul Mullowney ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr); 288*9ae82921SPaul Mullowney stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat); 289*9ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat; 290*9ae82921SPaul Mullowney // free temporary data 291*9ae82921SPaul Mullowney ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 292*9ae82921SPaul Mullowney ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 293*9ae82921SPaul Mullowney ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 294*9ae82921SPaul Mullowney } catch(char* ex) { 295*9ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 296*9ae82921SPaul Mullowney } 297*9ae82921SPaul Mullowney } 298*9ae82921SPaul Mullowney PetscFunctionReturn(0); 299*9ae82921SPaul Mullowney } 300*9ae82921SPaul Mullowney 301*9ae82921SPaul Mullowney #undef __FUNCT__ 302*9ae82921SPaul Mullowney #define __FUNCT__ "MatCUSPARSEAnalysiAndCopyToGPU" 303*9ae82921SPaul Mullowney PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat A) 304*9ae82921SPaul Mullowney { 305*9ae82921SPaul Mullowney PetscErrorCode ierr; 306*9ae82921SPaul Mullowney Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data; 307*9ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 308*9ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 309*9ae82921SPaul Mullowney PetscBool row_identity,col_identity; 310*9ae82921SPaul Mullowney const PetscInt *r,*c; 311*9ae82921SPaul Mullowney PetscInt n = A->rmap->n; 312*9ae82921SPaul Mullowney 313*9ae82921SPaul Mullowney PetscFunctionBegin; 314*9ae82921SPaul Mullowney ierr = MatCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr); 315*9ae82921SPaul Mullowney ierr = MatCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr); 316*9ae82921SPaul Mullowney cusparseTriFactors->tempvec = new CUSPARRAY; 317*9ae82921SPaul Mullowney cusparseTriFactors->tempvec->resize(n); 318*9ae82921SPaul Mullowney 319*9ae82921SPaul Mullowney A->valid_GPU_matrix = PETSC_CUSP_BOTH; 320*9ae82921SPaul Mullowney //lower triangular indices 321*9ae82921SPaul Mullowney ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 322*9ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 323*9ae82921SPaul Mullowney if (!row_identity) 324*9ae82921SPaul Mullowney ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr); 325*9ae82921SPaul Mullowney ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 326*9ae82921SPaul Mullowney 327*9ae82921SPaul Mullowney //upper triangular indices 328*9ae82921SPaul Mullowney ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 329*9ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 330*9ae82921SPaul Mullowney if (!col_identity) 331*9ae82921SPaul Mullowney ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr); 332*9ae82921SPaul Mullowney ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 333*9ae82921SPaul Mullowney PetscFunctionReturn(0); 334*9ae82921SPaul Mullowney } 335*9ae82921SPaul Mullowney 336*9ae82921SPaul Mullowney #undef __FUNCT__ 337*9ae82921SPaul Mullowney #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE" 338*9ae82921SPaul Mullowney PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 339*9ae82921SPaul Mullowney { 340*9ae82921SPaul Mullowney PetscErrorCode ierr; 341*9ae82921SPaul Mullowney Mat_SeqAIJ *b=(Mat_SeqAIJ *)B->data; 342*9ae82921SPaul Mullowney IS isrow = b->row,iscol = b->col; 343*9ae82921SPaul Mullowney PetscBool row_identity,col_identity; 344*9ae82921SPaul Mullowney 345*9ae82921SPaul Mullowney PetscFunctionBegin; 346*9ae82921SPaul Mullowney std::cout << "1 MatLUFactorNumeric_SeqAIJCUSPARSE"<<std::endl; 347*9ae82921SPaul Mullowney ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 348*9ae82921SPaul Mullowney std::cout << "2 MatLUFactorNumeric_SeqAIJCUSPARSE"<<std::endl; 349*9ae82921SPaul Mullowney // determine which version of MatSolve needs to be used. 350*9ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 351*9ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 352*9ae82921SPaul Mullowney if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 353*9ae82921SPaul Mullowney else B->ops->solve = MatSolve_SeqAIJCUSPARSE; 354*9ae82921SPaul Mullowney if (!row_identity) printf("Row permutations exist!"); 355*9ae82921SPaul Mullowney if (!col_identity) printf("Col permutations exist!"); 356*9ae82921SPaul Mullowney 357*9ae82921SPaul Mullowney // get the triangular factors 358*9ae82921SPaul Mullowney ierr = MatCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 359*9ae82921SPaul Mullowney std::cout << "3 MatLUFactorNumeric_SeqAIJCUSPARSE"<<std::endl; 360*9ae82921SPaul Mullowney PetscFunctionReturn(0); 361*9ae82921SPaul Mullowney } 362*9ae82921SPaul Mullowney 363*9ae82921SPaul Mullowney 364*9ae82921SPaul Mullowney 365*9ae82921SPaul Mullowney #undef __FUNCT__ 366*9ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE" 367*9ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 368*9ae82921SPaul Mullowney { 369*9ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 370*9ae82921SPaul Mullowney PetscErrorCode ierr; 371*9ae82921SPaul Mullowney CUSPARRAY *xGPU, *bGPU; 372*9ae82921SPaul Mullowney cusparseStatus_t stat; 373*9ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 374*9ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 375*9ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 376*9ae82921SPaul Mullowney CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec; 377*9ae82921SPaul Mullowney 378*9ae82921SPaul Mullowney PetscFunctionBegin; 379*9ae82921SPaul Mullowney // Get the GPU pointers 380*9ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 381*9ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 382*9ae82921SPaul Mullowney 383*9ae82921SPaul Mullowney // solve with reordering 384*9ae82921SPaul Mullowney ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr); 385*9ae82921SPaul Mullowney stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat); 386*9ae82921SPaul Mullowney stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat); 387*9ae82921SPaul Mullowney ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr); 388*9ae82921SPaul Mullowney 389*9ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 390*9ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 391*9ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 392*9ae82921SPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 393*9ae82921SPaul Mullowney PetscFunctionReturn(0); 394*9ae82921SPaul Mullowney } 395*9ae82921SPaul Mullowney 396*9ae82921SPaul Mullowney 397*9ae82921SPaul Mullowney 398*9ae82921SPaul Mullowney 399*9ae82921SPaul Mullowney #undef __FUNCT__ 400*9ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering" 401*9ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 402*9ae82921SPaul Mullowney { 403*9ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 404*9ae82921SPaul Mullowney PetscErrorCode ierr; 405*9ae82921SPaul Mullowney CUSPARRAY *xGPU, *bGPU; 406*9ae82921SPaul Mullowney cusparseStatus_t stat; 407*9ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 408*9ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 409*9ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 410*9ae82921SPaul Mullowney CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec; 411*9ae82921SPaul Mullowney 412*9ae82921SPaul Mullowney PetscFunctionBegin; 413*9ae82921SPaul Mullowney // Get the GPU pointers 414*9ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 415*9ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 416*9ae82921SPaul Mullowney 417*9ae82921SPaul Mullowney // solve 418*9ae82921SPaul Mullowney stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat); 419*9ae82921SPaul Mullowney stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat); 420*9ae82921SPaul Mullowney 421*9ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 422*9ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 423*9ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 424*9ae82921SPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 425*9ae82921SPaul Mullowney PetscFunctionReturn(0); 426*9ae82921SPaul Mullowney } 427*9ae82921SPaul Mullowney 428*9ae82921SPaul Mullowney #undef __FUNCT__ 429*9ae82921SPaul Mullowney #define __FUNCT__ "MatCUSPARSECopyToGPU" 430*9ae82921SPaul Mullowney PetscErrorCode MatCUSPARSECopyToGPU(Mat A) 431*9ae82921SPaul Mullowney { 432*9ae82921SPaul Mullowney 433*9ae82921SPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 434*9ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 435*9ae82921SPaul Mullowney PetscInt m = A->rmap->n,*ii,*ridx; 436*9ae82921SPaul Mullowney PetscErrorCode ierr; 437*9ae82921SPaul Mullowney 438*9ae82921SPaul Mullowney 439*9ae82921SPaul Mullowney PetscFunctionBegin; 440*9ae82921SPaul Mullowney if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){ 441*9ae82921SPaul Mullowney ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 442*9ae82921SPaul Mullowney /* 443*9ae82921SPaul Mullowney It may be possible to reuse nonzero structure with new matrix values but 444*9ae82921SPaul Mullowney for simplicity and insured correctness we delete and build a new matrix on 445*9ae82921SPaul Mullowney the GPU. Likely a very small performance hit. 446*9ae82921SPaul Mullowney */ 447*9ae82921SPaul Mullowney if (cusparseMat->mat){ 448*9ae82921SPaul Mullowney try { 449*9ae82921SPaul Mullowney delete cusparseMat->mat; 450*9ae82921SPaul Mullowney if (cusparseMat->tempvec) 451*9ae82921SPaul Mullowney delete cusparseMat->tempvec; 452*9ae82921SPaul Mullowney 453*9ae82921SPaul Mullowney } catch(char* ex) { 454*9ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 455*9ae82921SPaul Mullowney } 456*9ae82921SPaul Mullowney } 457*9ae82921SPaul Mullowney try { 458*9ae82921SPaul Mullowney cusparseMat->nonzerorow=0; 459*9ae82921SPaul Mullowney for (int j = 0; j<m; j++) 460*9ae82921SPaul Mullowney cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0); 461*9ae82921SPaul Mullowney 462*9ae82921SPaul Mullowney if (a->compressedrow.use) { 463*9ae82921SPaul Mullowney m = a->compressedrow.nrows; 464*9ae82921SPaul Mullowney ii = a->compressedrow.i; 465*9ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 466*9ae82921SPaul Mullowney } else { 467*9ae82921SPaul Mullowney // Forcing compressed row on the GPU ... only relevant for CSR storage 468*9ae82921SPaul Mullowney int k=0; 469*9ae82921SPaul Mullowney ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr); 470*9ae82921SPaul Mullowney ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr); 471*9ae82921SPaul Mullowney ii[0]=0; 472*9ae82921SPaul Mullowney for (int j = 0; j<m; j++) { 473*9ae82921SPaul Mullowney if ((a->i[j+1]-a->i[j])>0) { 474*9ae82921SPaul Mullowney ii[k] = a->i[j]; 475*9ae82921SPaul Mullowney ridx[k]= j; 476*9ae82921SPaul Mullowney k++; 477*9ae82921SPaul Mullowney } 478*9ae82921SPaul Mullowney } 479*9ae82921SPaul Mullowney ii[cusparseMat->nonzerorow] = a->nz; 480*9ae82921SPaul Mullowney m = cusparseMat->nonzerorow; 481*9ae82921SPaul Mullowney } 482*9ae82921SPaul Mullowney 483*9ae82921SPaul Mullowney // Build our matrix ... first determine the GPU storage type 484*9ae82921SPaul Mullowney cusparseMat->mat = GPU_Matrix_Factory::getNew(cusparseMat->format); 485*9ae82921SPaul Mullowney 486*9ae82921SPaul Mullowney // FILL MODE UPPER is irrelevant 487*9ae82921SPaul Mullowney cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 488*9ae82921SPaul Mullowney 489*9ae82921SPaul Mullowney // Create the streams and events (if desired). 490*9ae82921SPaul Mullowney PetscMPIInt size; 491*9ae82921SPaul Mullowney ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr); 492*9ae82921SPaul Mullowney ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr); 493*9ae82921SPaul Mullowney 494*9ae82921SPaul Mullowney // lastly, build the matrix 495*9ae82921SPaul Mullowney ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr); 496*9ae82921SPaul Mullowney cusparseMat->mat->setCPRowIndices(ridx, m); 497*9ae82921SPaul Mullowney // } 498*9ae82921SPaul Mullowney if (!a->compressedrow.use) { 499*9ae82921SPaul Mullowney // free data 500*9ae82921SPaul Mullowney ierr = PetscFree(ii);CHKERRQ(ierr); 501*9ae82921SPaul Mullowney ierr = PetscFree(ridx);CHKERRQ(ierr); 502*9ae82921SPaul Mullowney } 503*9ae82921SPaul Mullowney 504*9ae82921SPaul Mullowney cusparseMat->tempvec = new CUSPARRAY; 505*9ae82921SPaul Mullowney cusparseMat->tempvec->resize(m); 506*9ae82921SPaul Mullowney } catch(char* ex) { 507*9ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 508*9ae82921SPaul Mullowney } 509*9ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 510*9ae82921SPaul Mullowney A->valid_GPU_matrix = PETSC_CUSP_BOTH; 511*9ae82921SPaul Mullowney ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 512*9ae82921SPaul Mullowney } 513*9ae82921SPaul Mullowney PetscFunctionReturn(0); 514*9ae82921SPaul Mullowney } 515*9ae82921SPaul Mullowney 516*9ae82921SPaul Mullowney // #undef __FUNCT__ 517*9ae82921SPaul Mullowney // #define __FUNCT__ "MatCUSPARSECopyFromGPU" 518*9ae82921SPaul Mullowney // PetscErrorCode MatCUSPARSECopyFromGPU(Mat A, CUSPMATRIX *Agpu) 519*9ae82921SPaul Mullowney // { 520*9ae82921SPaul Mullowney // Mat_SeqAIJCUSPARSE *cuspstruct = (Mat_SeqAIJCUSPARSE *) A->spptr; 521*9ae82921SPaul Mullowney // Mat_SeqAIJ *a = (Mat_SeqAIJ *) A->data; 522*9ae82921SPaul Mullowney // PetscInt m = A->rmap->n; 523*9ae82921SPaul Mullowney // PetscErrorCode ierr; 524*9ae82921SPaul Mullowney 525*9ae82921SPaul Mullowney // PetscFunctionBegin; 526*9ae82921SPaul Mullowney // if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) { 527*9ae82921SPaul Mullowney // if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) { 528*9ae82921SPaul Mullowney // try { 529*9ae82921SPaul Mullowney // cuspstruct->mat = Agpu; 530*9ae82921SPaul Mullowney // if (a->compressedrow.use) { 531*9ae82921SPaul Mullowney // //PetscInt *ii, *ridx; 532*9ae82921SPaul Mullowney // SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Cannot handle row compression for GPU matrices"); 533*9ae82921SPaul Mullowney // } else { 534*9ae82921SPaul Mullowney // PetscInt i; 535*9ae82921SPaul Mullowney 536*9ae82921SPaul Mullowney // if (m+1 != (PetscInt) cuspstruct->mat->row_offsets.size()) {SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", cuspstruct->mat->row_offsets.size()-1, m);} 537*9ae82921SPaul Mullowney // a->nz = cuspstruct->mat->values.size(); 538*9ae82921SPaul Mullowney // a->maxnz = a->nz; /* Since we allocate exactly the right amount */ 539*9ae82921SPaul Mullowney // A->preallocated = PETSC_TRUE; 540*9ae82921SPaul Mullowney // // Copy ai, aj, aa 541*9ae82921SPaul Mullowney // if (a->singlemalloc) { 542*9ae82921SPaul Mullowney // if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);} 543*9ae82921SPaul Mullowney // } else { 544*9ae82921SPaul Mullowney // if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);} 545*9ae82921SPaul Mullowney // if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);} 546*9ae82921SPaul Mullowney // if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);} 547*9ae82921SPaul Mullowney // } 548*9ae82921SPaul Mullowney // ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr); 549*9ae82921SPaul Mullowney // ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 550*9ae82921SPaul Mullowney // a->singlemalloc = PETSC_TRUE; 551*9ae82921SPaul Mullowney // thrust::copy(cuspstruct->mat->row_offsets.begin(), cuspstruct->mat->row_offsets.end(), a->i); 552*9ae82921SPaul Mullowney // thrust::copy(cuspstruct->mat->column_indices.begin(), cuspstruct->mat->column_indices.end(), a->j); 553*9ae82921SPaul Mullowney // thrust::copy(cuspstruct->mat->values.begin(), cuspstruct->mat->values.end(), a->a); 554*9ae82921SPaul Mullowney // // Setup row lengths 555*9ae82921SPaul Mullowney // if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);} 556*9ae82921SPaul Mullowney // ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr); 557*9ae82921SPaul Mullowney // ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr); 558*9ae82921SPaul Mullowney // for(i = 0; i < m; ++i) { 559*9ae82921SPaul Mullowney // a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i]; 560*9ae82921SPaul Mullowney // } 561*9ae82921SPaul Mullowney // // a->diag? 562*9ae82921SPaul Mullowney // } 563*9ae82921SPaul Mullowney // cuspstruct->tempvec = new CUSPARRAY; 564*9ae82921SPaul Mullowney // cuspstruct->tempvec->resize(m); 565*9ae82921SPaul Mullowney // } catch(char *ex) { 566*9ae82921SPaul Mullowney // SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSP error: %s", ex); 567*9ae82921SPaul Mullowney // } 568*9ae82921SPaul Mullowney // } 569*9ae82921SPaul Mullowney // // This assembly prevents resetting the flag to PETSC_CUSP_CPU and recopying 570*9ae82921SPaul Mullowney // ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 571*9ae82921SPaul Mullowney // ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 572*9ae82921SPaul Mullowney // A->valid_GPU_matrix = PETSC_CUSP_BOTH; 573*9ae82921SPaul Mullowney // } else { 574*9ae82921SPaul Mullowney // SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Only valid for unallocated GPU matrices"); 575*9ae82921SPaul Mullowney // } 576*9ae82921SPaul Mullowney // PetscFunctionReturn(0); 577*9ae82921SPaul Mullowney // } 578*9ae82921SPaul Mullowney 579*9ae82921SPaul Mullowney #undef __FUNCT__ 580*9ae82921SPaul Mullowney #define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE" 581*9ae82921SPaul Mullowney PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left) 582*9ae82921SPaul Mullowney { 583*9ae82921SPaul Mullowney PetscErrorCode ierr; 584*9ae82921SPaul Mullowney 585*9ae82921SPaul Mullowney PetscFunctionBegin; 586*9ae82921SPaul Mullowney 587*9ae82921SPaul Mullowney if (right) { 588*9ae82921SPaul Mullowney ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr); 589*9ae82921SPaul Mullowney ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 590*9ae82921SPaul Mullowney ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 591*9ae82921SPaul Mullowney ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr); 592*9ae82921SPaul Mullowney ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 593*9ae82921SPaul Mullowney } 594*9ae82921SPaul Mullowney if (left) { 595*9ae82921SPaul Mullowney ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr); 596*9ae82921SPaul Mullowney ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 597*9ae82921SPaul Mullowney ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 598*9ae82921SPaul Mullowney ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr); 599*9ae82921SPaul Mullowney ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 600*9ae82921SPaul Mullowney } 601*9ae82921SPaul Mullowney PetscFunctionReturn(0); 602*9ae82921SPaul Mullowney } 603*9ae82921SPaul Mullowney 604*9ae82921SPaul Mullowney #undef __FUNCT__ 605*9ae82921SPaul Mullowney #define __FUNCT__ "MatMult_SeqAIJCUSPARSE" 606*9ae82921SPaul Mullowney PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 607*9ae82921SPaul Mullowney { 608*9ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 609*9ae82921SPaul Mullowney PetscErrorCode ierr; 610*9ae82921SPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 611*9ae82921SPaul Mullowney CUSPARRAY *xarray,*yarray; 612*9ae82921SPaul Mullowney 613*9ae82921SPaul Mullowney PetscFunctionBegin; 614*9ae82921SPaul Mullowney // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 615*9ae82921SPaul Mullowney // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 616*9ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 617*9ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 618*9ae82921SPaul Mullowney try { 619*9ae82921SPaul Mullowney ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr); 620*9ae82921SPaul Mullowney } catch (char* ex) { 621*9ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 622*9ae82921SPaul Mullowney } 623*9ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 624*9ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 625*9ae82921SPaul Mullowney if (!cusparseMat->mat->hasNonZeroStream()) 626*9ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 627*9ae82921SPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); 628*9ae82921SPaul Mullowney PetscFunctionReturn(0); 629*9ae82921SPaul Mullowney } 630*9ae82921SPaul Mullowney 631*9ae82921SPaul Mullowney 632*9ae82921SPaul Mullowney #undef __FUNCT__ 633*9ae82921SPaul Mullowney #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 634*9ae82921SPaul Mullowney PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 635*9ae82921SPaul Mullowney { 636*9ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 637*9ae82921SPaul Mullowney PetscErrorCode ierr; 638*9ae82921SPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 639*9ae82921SPaul Mullowney CUSPARRAY *xarray,*yarray,*zarray; 640*9ae82921SPaul Mullowney PetscFunctionBegin; 641*9ae82921SPaul Mullowney // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 642*9ae82921SPaul Mullowney // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 643*9ae82921SPaul Mullowney try { 644*9ae82921SPaul Mullowney ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 645*9ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 646*9ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 647*9ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 648*9ae82921SPaul Mullowney 649*9ae82921SPaul Mullowney // multiply add 650*9ae82921SPaul Mullowney ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr); 651*9ae82921SPaul Mullowney 652*9ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 653*9ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 654*9ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 655*9ae82921SPaul Mullowney 656*9ae82921SPaul Mullowney } catch(char* ex) { 657*9ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 658*9ae82921SPaul Mullowney } 659*9ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 660*9ae82921SPaul Mullowney ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 661*9ae82921SPaul Mullowney PetscFunctionReturn(0); 662*9ae82921SPaul Mullowney } 663*9ae82921SPaul Mullowney 664*9ae82921SPaul Mullowney #undef __FUNCT__ 665*9ae82921SPaul Mullowney #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 666*9ae82921SPaul Mullowney PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 667*9ae82921SPaul Mullowney { 668*9ae82921SPaul Mullowney PetscErrorCode ierr; 669*9ae82921SPaul Mullowney PetscFunctionBegin; 670*9ae82921SPaul Mullowney ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 671*9ae82921SPaul Mullowney ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 672*9ae82921SPaul Mullowney if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 673*9ae82921SPaul Mullowney // this is not necessary since MatCUSPARSECopyToGPU has been called. 674*9ae82921SPaul Mullowney //if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){ 675*9ae82921SPaul Mullowney // A->valid_GPU_matrix = PETSC_CUSP_CPU; 676*9ae82921SPaul Mullowney //} 677*9ae82921SPaul Mullowney A->ops->mult = MatMult_SeqAIJCUSPARSE; 678*9ae82921SPaul Mullowney A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 679*9ae82921SPaul Mullowney PetscFunctionReturn(0); 680*9ae82921SPaul Mullowney } 681*9ae82921SPaul Mullowney 682*9ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 683*9ae82921SPaul Mullowney #undef __FUNCT__ 684*9ae82921SPaul Mullowney #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 685*9ae82921SPaul Mullowney /*@C 686*9ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 687*9ae82921SPaul Mullowney (the default parallel PETSc format). For good matrix assembly performance 688*9ae82921SPaul Mullowney the user should preallocate the matrix storage by setting the parameter nz 689*9ae82921SPaul Mullowney (or the array nnz). By setting these parameters accurately, performance 690*9ae82921SPaul Mullowney during matrix assembly can be increased by more than a factor of 50. 691*9ae82921SPaul Mullowney 692*9ae82921SPaul Mullowney Collective on MPI_Comm 693*9ae82921SPaul Mullowney 694*9ae82921SPaul Mullowney Input Parameters: 695*9ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 696*9ae82921SPaul Mullowney . m - number of rows 697*9ae82921SPaul Mullowney . n - number of columns 698*9ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 699*9ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 700*9ae82921SPaul Mullowney (possibly different for each row) or PETSC_NULL 701*9ae82921SPaul Mullowney 702*9ae82921SPaul Mullowney Output Parameter: 703*9ae82921SPaul Mullowney . A - the matrix 704*9ae82921SPaul Mullowney 705*9ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 706*9ae82921SPaul Mullowney MatXXXXSetPreallocation() paradgm instead of this routine directly. 707*9ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 708*9ae82921SPaul Mullowney 709*9ae82921SPaul Mullowney Notes: 710*9ae82921SPaul Mullowney If nnz is given then nz is ignored 711*9ae82921SPaul Mullowney 712*9ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 713*9ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 714*9ae82921SPaul Mullowney storage. That is, the stored row and column indices can begin at 715*9ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 716*9ae82921SPaul Mullowney 717*9ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 718*9ae82921SPaul Mullowney Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory 719*9ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 720*9ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 721*9ae82921SPaul Mullowney 722*9ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 723*9ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 724*9ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 725*9ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 726*9ae82921SPaul Mullowney 727*9ae82921SPaul Mullowney Level: intermediate 728*9ae82921SPaul Mullowney 729*9ae82921SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 730*9ae82921SPaul Mullowney 731*9ae82921SPaul Mullowney @*/ 732*9ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 733*9ae82921SPaul Mullowney { 734*9ae82921SPaul Mullowney PetscErrorCode ierr; 735*9ae82921SPaul Mullowney 736*9ae82921SPaul Mullowney PetscFunctionBegin; 737*9ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 738*9ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 739*9ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 740*9ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 741*9ae82921SPaul Mullowney PetscFunctionReturn(0); 742*9ae82921SPaul Mullowney } 743*9ae82921SPaul Mullowney 744*9ae82921SPaul Mullowney #undef __FUNCT__ 745*9ae82921SPaul Mullowney #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 746*9ae82921SPaul Mullowney PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 747*9ae82921SPaul Mullowney { 748*9ae82921SPaul Mullowney PetscErrorCode ierr; 749*9ae82921SPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 750*9ae82921SPaul Mullowney 751*9ae82921SPaul Mullowney PetscFunctionBegin; 752*9ae82921SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 753*9ae82921SPaul Mullowney // The regular matrices 754*9ae82921SPaul Mullowney try { 755*9ae82921SPaul Mullowney if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){ 756*9ae82921SPaul Mullowney delete (GPU_Matrix_Ifc *)(cusparseMat->mat); 757*9ae82921SPaul Mullowney } 758*9ae82921SPaul Mullowney if (cusparseMat->tempvec!=0) 759*9ae82921SPaul Mullowney delete cusparseMat->tempvec; 760*9ae82921SPaul Mullowney delete cusparseMat; 761*9ae82921SPaul Mullowney A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 762*9ae82921SPaul Mullowney } catch(char* ex) { 763*9ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 764*9ae82921SPaul Mullowney } 765*9ae82921SPaul Mullowney } else { 766*9ae82921SPaul Mullowney // The triangular factors 767*9ae82921SPaul Mullowney try { 768*9ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 769*9ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 770*9ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 771*9ae82921SPaul Mullowney // destroy the matrix and the container 772*9ae82921SPaul Mullowney delete (GPU_Matrix_Ifc *)cusparseMatLo; 773*9ae82921SPaul Mullowney delete (GPU_Matrix_Ifc *)cusparseMatUp; 774*9ae82921SPaul Mullowney delete (CUSPARRAY*) cusparseTriFactors->tempvec; 775*9ae82921SPaul Mullowney delete cusparseTriFactors; 776*9ae82921SPaul Mullowney } catch(char* ex) { 777*9ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 778*9ae82921SPaul Mullowney } 779*9ae82921SPaul Mullowney } 780*9ae82921SPaul Mullowney if (MAT_cusparseHandle) { 781*9ae82921SPaul Mullowney cusparseStatus_t stat; 782*9ae82921SPaul Mullowney stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat); 783*9ae82921SPaul Mullowney MAT_cusparseHandle=0; 784*9ae82921SPaul Mullowney } 785*9ae82921SPaul Mullowney /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 786*9ae82921SPaul Mullowney A->spptr = 0; 787*9ae82921SPaul Mullowney 788*9ae82921SPaul Mullowney ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 789*9ae82921SPaul Mullowney PetscFunctionReturn(0); 790*9ae82921SPaul Mullowney } 791*9ae82921SPaul Mullowney 792*9ae82921SPaul Mullowney 793*9ae82921SPaul Mullowney EXTERN_C_BEGIN 794*9ae82921SPaul Mullowney #undef __FUNCT__ 795*9ae82921SPaul Mullowney #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 796*9ae82921SPaul Mullowney PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 797*9ae82921SPaul Mullowney { 798*9ae82921SPaul Mullowney PetscErrorCode ierr; 799*9ae82921SPaul Mullowney 800*9ae82921SPaul Mullowney PetscFunctionBegin; 801*9ae82921SPaul Mullowney ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 802*9ae82921SPaul Mullowney B->ops->mult = MatMult_SeqAIJCUSPARSE; 803*9ae82921SPaul Mullowney B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 804*9ae82921SPaul Mullowney 805*9ae82921SPaul Mullowney if (B->factortype==MAT_FACTOR_NONE) { 806*9ae82921SPaul Mullowney /* you cannot check the inode.use flag here since the matrix was just created.*/ 807*9ae82921SPaul Mullowney B->spptr = new Mat_SeqAIJCUSPARSE; 808*9ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSE *)B->spptr)->mat = 0; 809*9ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSE *)B->spptr)->tempvec = 0; 810*9ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSE *)B->spptr)->format = CSR; 811*9ae82921SPaul Mullowney } else { 812*9ae82921SPaul Mullowney /* NEXT, set the pointers to the triangular factors */ 813*9ae82921SPaul Mullowney B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 814*9ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->loTriFactorPtr = 0; 815*9ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->upTriFactorPtr = 0; 816*9ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->tempvec = 0; 817*9ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->format = CSR; 818*9ae82921SPaul Mullowney } 819*9ae82921SPaul Mullowney // Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) 820*9ae82921SPaul Mullowney if (!MAT_cusparseHandle) { 821*9ae82921SPaul Mullowney cusparseStatus_t stat; 822*9ae82921SPaul Mullowney stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat); 823*9ae82921SPaul Mullowney } 824*9ae82921SPaul Mullowney // Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the 825*9ae82921SPaul Mullowney // default cusparse tri solve. Note the difference with the implementation in 826*9ae82921SPaul Mullowney // MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu 827*9ae82921SPaul Mullowney ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr); 828*9ae82921SPaul Mullowney B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 829*9ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 830*9ae82921SPaul Mullowney B->ops->getvecs = MatGetVecs_SeqAIJCUSPARSE; 831*9ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 832*9ae82921SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 833*9ae82921SPaul Mullowney B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 834*9ae82921SPaul Mullowney PetscFunctionReturn(0); 835*9ae82921SPaul Mullowney } 836*9ae82921SPaul Mullowney EXTERN_C_END 837*9ae82921SPaul Mullowney 838