xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision ca45077f9abf8e7344aac749d49eef6cfd88f621)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney     Defines the basic matrix operations for the AIJ (compressed row)
39ae82921SPaul Mullowney   matrix storage format.
49ae82921SPaul Mullowney */
59ae82921SPaul Mullowney 
69ae82921SPaul Mullowney #include "petscconf.h"
79ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_BEGIN
89ae82921SPaul Mullowney #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
99ae82921SPaul Mullowney //#include "petscbt.h"
109ae82921SPaul Mullowney #include "../src/vec/vec/impls/dvecimpl.h"
119ae82921SPaul Mullowney #include "petsc-private/vecimpl.h"
129ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_END
139ae82921SPaul Mullowney #undef VecType
149ae82921SPaul Mullowney #include "cusparsematimpl.h"
159ae82921SPaul Mullowney 
16*ca45077fSPaul Mullowney // this is such a hack ... but I don't know of another way to pass this variable
17*ca45077fSPaul Mullowney // from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
18*ca45077fSPaul Mullowney //  SpMV. Essentially, I need to use the same stream variable in two different
19*ca45077fSPaul Mullowney //  data structures. I do this by creating a single instance of that stream
20*ca45077fSPaul Mullowney //  and reuse it.
// Shared CUDA stream handle; 0 selects the (legacy) default stream.
// NOTE(review): a pointer to this is handed to buildStreamsAndEvents() below,
// which presumably (re)assigns it -- confirm in the txpetscgpu implementation.
21*ca45077fSPaul Mullowney cudaStream_t theBodyStream=0;
229ae82921SPaul Mullowney 
239ae82921SPaul Mullowney EXTERN_C_BEGIN
249ae82921SPaul Mullowney PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
259ae82921SPaul Mullowney PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
269ae82921SPaul Mullowney PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *);
279ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
289ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
299ae82921SPaul Mullowney PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
309ae82921SPaul Mullowney PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat);
319ae82921SPaul Mullowney EXTERN_C_END
329ae82921SPaul Mullowney 
339ae82921SPaul Mullowney EXTERN_C_BEGIN
349ae82921SPaul Mullowney #undef __FUNCT__
359ae82921SPaul Mullowney #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/*
   Query callback composed onto factored matrices (see MatGetFactor_seqaij_cusparse):
   reports that this factorization is backed by the CUSPARSE solver package.

   Output Parameter:
.  type - set to MATSOLVERCUSPARSE
*/
369ae82921SPaul Mullowney PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
379ae82921SPaul Mullowney {
389ae82921SPaul Mullowney   PetscFunctionBegin;
399ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
409ae82921SPaul Mullowney   PetscFunctionReturn(0);
419ae82921SPaul Mullowney }
429ae82921SPaul Mullowney EXTERN_C_END
439ae82921SPaul Mullowney 
449ae82921SPaul Mullowney EXTERN_C_BEGIN
459ae82921SPaul Mullowney extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
469ae82921SPaul Mullowney EXTERN_C_END
479ae82921SPaul Mullowney 
489ae82921SPaul Mullowney 
499ae82921SPaul Mullowney 
509ae82921SPaul Mullowney /*MC
519ae82921SPaul Mullowney   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices
529ae82921SPaul Mullowney   on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp
539ae82921SPaul Mullowney 
549ae82921SPaul Mullowney   ./configure --download-txpetscgpu to install PETSc to use CUSPARSE solves
559ae82921SPaul Mullowney 
569ae82921SPaul Mullowney   Options Database Keys:
57*ca45077fSPaul Mullowney + -mat_mult_cusparse_storage_format <CSR>         - (choose one of) CSR, ELL, HYB
58*ca45077fSPaul Mullowney - -mat_solve_cusparse_storage_format <CSR>        - (choose one of) CSR, ELL, HYB
599ae82921SPaul Mullowney 
609ae82921SPaul Mullowney    Level: beginner
619ae82921SPaul Mullowney M*/
629ae82921SPaul Mullowney 
639ae82921SPaul Mullowney EXTERN_C_BEGIN
649ae82921SPaul Mullowney #undef __FUNCT__
659ae82921SPaul Mullowney #define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Factory for CUSPARSE-backed factor matrices: builds the factor with the stock
   PETSc AIJ factory, then retypes it to SEQAIJCUSPARSE and installs the CUSPARSE
   symbolic-factorization hooks. Only LU/ILU/ILUDT factor types are supported;
   any other type raises PETSC_ERR_SUP.
*/
669ae82921SPaul Mullowney PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
679ae82921SPaul Mullowney {
689ae82921SPaul Mullowney   PetscErrorCode     ierr;
699ae82921SPaul Mullowney 
709ae82921SPaul Mullowney   PetscFunctionBegin;
719ae82921SPaul Mullowney   ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
729ae82921SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT){
739ae82921SPaul Mullowney     ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
749ae82921SPaul Mullowney     ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
759ae82921SPaul Mullowney     ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
769ae82921SPaul Mullowney     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
779ae82921SPaul Mullowney     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
789ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
799ae82921SPaul Mullowney   (*B)->factortype = ftype;
809ae82921SPaul Mullowney   PetscFunctionReturn(0);
819ae82921SPaul Mullowney }
829ae82921SPaul Mullowney EXTERN_C_END
839ae82921SPaul Mullowney 
849ae82921SPaul Mullowney #undef __FUNCT__
859ae82921SPaul Mullowney #define __FUNCT__ "MatAIJCUSPARSESetGPUStorageFormatForMatSolve"
/*
   Sets the GPU storage format used for the triangular solves of a factored
   SEQAIJCUSPARSE matrix. If both triangular factors already live on the GPU
   they are destroyed and rebuilt in the requested format.
*/
86*ca45077fSPaul Mullowney PetscErrorCode MatAIJCUSPARSESetGPUStorageFormatForMatSolve(Mat A,GPUStorageFormat format)
879ae82921SPaul Mullowney {
889ae82921SPaul Mullowney   PetscErrorCode     ierr;
899ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
909ae82921SPaul Mullowney   GPU_Matrix_Ifc* cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
919ae82921SPaul Mullowney   GPU_Matrix_Ifc* cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
929ae82921SPaul Mullowney   PetscFunctionBegin;
939ae82921SPaul Mullowney   cusparseTriFactors->format = format;
949ae82921SPaul Mullowney   if (cusparseMatLo && cusparseMatUp) {
959ae82921SPaul Mullowney     // If data exists already delete
     // NOTE(review): the inner null checks are redundant (the enclosing if
     // already guarantees both are non-NULL), and loTriFactorPtr/upTriFactorPtr
     // are left dangling until the rebuild below overwrites them -- confirm no
     // intervening code can observe them.
969ae82921SPaul Mullowney     if (cusparseMatLo)
979ae82921SPaul Mullowney       delete (GPU_Matrix_Ifc *)cusparseMatLo;
989ae82921SPaul Mullowney     if (cusparseMatUp)
999ae82921SPaul Mullowney       delete (GPU_Matrix_Ifc *)cusparseMatUp;
1009ae82921SPaul Mullowney 
1019ae82921SPaul Mullowney     // key data was deleted so reset the flag.
1029ae82921SPaul Mullowney     A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
1039ae82921SPaul Mullowney 
1049ae82921SPaul Mullowney     // rebuild matrix
1059ae82921SPaul Mullowney     ierr=MatCUSPARSEAnalysisAndCopyToGPU(A);CHKERRQ(ierr);
1069ae82921SPaul Mullowney   }
1079ae82921SPaul Mullowney   PetscFunctionReturn(0);
1089ae82921SPaul Mullowney }
1099ae82921SPaul Mullowney 
110*ca45077fSPaul Mullowney //EXTERN_C_BEGIN
111*ca45077fSPaul Mullowney #undef __FUNCT__
112*ca45077fSPaul Mullowney #define __FUNCT__ "MatSetOption_SeqAIJCUSPARSE"
/*
   Option handler for SEQAIJCUSPARSE: forwards every option to the base
   MatSetOption_SeqAIJ first, then records the requested GPU storage format
   (CSR, ELL or HYB) in the Mat_SeqAIJCUSPARSE structure. The DIA format is
   explicitly rejected; unrecognized options are ignored here (the base
   handler has already processed them).
*/
113*ca45077fSPaul Mullowney PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool  flg)
114*ca45077fSPaul Mullowney {
115*ca45077fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
116*ca45077fSPaul Mullowney   PetscErrorCode ierr;
117*ca45077fSPaul Mullowney   PetscFunctionBegin;
118*ca45077fSPaul Mullowney   ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
119*ca45077fSPaul Mullowney   switch (op) {
120*ca45077fSPaul Mullowney   case DIAGBLOCK_MAT_CSR:
121*ca45077fSPaul Mullowney   case OFFDIAGBLOCK_MAT_CSR:
122*ca45077fSPaul Mullowney   case MAT_CSR:
123*ca45077fSPaul Mullowney     //std::cout << "MatSetOption_SeqAIJCUSPARSE : CSR" << std::endl;
124*ca45077fSPaul Mullowney     cusparseMat->format = CSR;
125*ca45077fSPaul Mullowney     break;
126*ca45077fSPaul Mullowney   case DIAGBLOCK_MAT_DIA:
127*ca45077fSPaul Mullowney   case OFFDIAGBLOCK_MAT_DIA:
128*ca45077fSPaul Mullowney   case MAT_DIA:
129*ca45077fSPaul Mullowney     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported GPU matrix storage format DIA for (MPI,SEQ)AIJCUSPARSE matrix type.");
130*ca45077fSPaul Mullowney   case DIAGBLOCK_MAT_ELL:
131*ca45077fSPaul Mullowney   case OFFDIAGBLOCK_MAT_ELL:
132*ca45077fSPaul Mullowney   case MAT_ELL:
133*ca45077fSPaul Mullowney     //std::cout << "MatSetOption_SeqAIJCUSPARSE : ELL" << std::endl;
134*ca45077fSPaul Mullowney     cusparseMat->format = ELL;
135*ca45077fSPaul Mullowney     break;
136*ca45077fSPaul Mullowney   case DIAGBLOCK_MAT_HYB:
137*ca45077fSPaul Mullowney   case OFFDIAGBLOCK_MAT_HYB:
138*ca45077fSPaul Mullowney   case MAT_HYB:
139*ca45077fSPaul Mullowney     //std::cout << "MatSetOption_SeqAIJCUSPARSE : HYB" << std::endl;
140*ca45077fSPaul Mullowney     cusparseMat->format = HYB;
141*ca45077fSPaul Mullowney     break;
142*ca45077fSPaul Mullowney   default:
143*ca45077fSPaul Mullowney     break;
144*ca45077fSPaul Mullowney   }
145*ca45077fSPaul Mullowney   PetscFunctionReturn(0);
146*ca45077fSPaul Mullowney }
147*ca45077fSPaul Mullowney //EXTERN_C_END
1489ae82921SPaul Mullowney 
149*ca45077fSPaul Mullowney //EXTERN_C_BEGIN
1509ae82921SPaul Mullowney #undef __FUNCT__
1519ae82921SPaul Mullowney #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes the -mat_mult_cusparse_storage_format (unfactored matrices) or
   -mat_solve_cusparse_storage_format (factored matrices) option and applies
   the chosen CSR/ELL/HYB storage format.

   NOTE(review): `formats[idx] == CSR` compares pointer identity, not string
   contents -- this only works if CSR/ELL/HYB expand to the very same string
   objects used to build `formats`; confirm against cusparsematimpl.h.
   NOTE(review): `format` is assigned only inside the if/else-if chain; it is
   covered because the three-entry list forces idx into {0,1,2}, but a final
   else (or initializer) would make that explicit.
*/
1529ae82921SPaul Mullowney PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
1539ae82921SPaul Mullowney {
1549ae82921SPaul Mullowney   PetscErrorCode     ierr;
1559ae82921SPaul Mullowney   PetscInt       idx=0;
156*ca45077fSPaul Mullowney   char * formats[] ={CSR,ELL,HYB};
157*ca45077fSPaul Mullowney   MatOption format;
1589ae82921SPaul Mullowney   PetscBool      flg;
1599ae82921SPaul Mullowney   PetscFunctionBegin;
1609ae82921SPaul Mullowney   ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"When using TxPETSCGPU, AIJCUSPARSE Options","Mat");CHKERRQ(ierr);
1619ae82921SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
1629ae82921SPaul Mullowney     ierr = PetscOptionsEList("-mat_mult_cusparse_storage_format",
1639ae82921SPaul Mullowney 			     "Set the storage format of (seq)aijcusparse gpu matrices for SpMV",
164*ca45077fSPaul Mullowney 			     "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr);
165*ca45077fSPaul Mullowney     if (formats[idx] == CSR)
166*ca45077fSPaul Mullowney       format=MAT_CSR;
167*ca45077fSPaul Mullowney     else if (formats[idx] == ELL)
168*ca45077fSPaul Mullowney       format=MAT_ELL;
169*ca45077fSPaul Mullowney     else if (formats[idx] == HYB)
170*ca45077fSPaul Mullowney       format=MAT_HYB;
171*ca45077fSPaul Mullowney 
172*ca45077fSPaul Mullowney     ierr=MatSetOption_SeqAIJCUSPARSE(A,format,PETSC_TRUE);CHKERRQ(ierr);
1739ae82921SPaul Mullowney   }
1749ae82921SPaul Mullowney   else {
1759ae82921SPaul Mullowney     ierr = PetscOptionsEList("-mat_solve_cusparse_storage_format",
1769ae82921SPaul Mullowney 			     "Set the storage format of (seq)aijcusparse gpu matrices for TriSolve",
177*ca45077fSPaul Mullowney 			     "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr);
1789ae82921SPaul Mullowney     ierr=MatAIJCUSPARSESetGPUStorageFormatForMatSolve(A,formats[idx]);CHKERRQ(ierr);
1799ae82921SPaul Mullowney   }
1809ae82921SPaul Mullowney   ierr = PetscOptionsEnd();CHKERRQ(ierr);
1819ae82921SPaul Mullowney   PetscFunctionReturn(0);
1829ae82921SPaul Mullowney 
1839ae82921SPaul Mullowney }
184*ca45077fSPaul Mullowney //EXTERN_C_END
1859ae82921SPaul Mullowney 
1869ae82921SPaul Mullowney #undef __FUNCT__
1879ae82921SPaul Mullowney #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic ILU factorization: performed entirely on the CPU by the base AIJ
   routine; only the numeric-factorization hook is redirected to the CUSPARSE
   version so the factors end up on the GPU.
*/
1889ae82921SPaul Mullowney PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1899ae82921SPaul Mullowney {
1909ae82921SPaul Mullowney   PetscErrorCode     ierr;
1919ae82921SPaul Mullowney 
1929ae82921SPaul Mullowney   PetscFunctionBegin;
1939ae82921SPaul Mullowney   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1949ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1959ae82921SPaul Mullowney   PetscFunctionReturn(0);
1969ae82921SPaul Mullowney }
1979ae82921SPaul Mullowney 
1989ae82921SPaul Mullowney #undef __FUNCT__
1999ae82921SPaul Mullowney #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic LU factorization: performed entirely on the CPU by the base AIJ
   routine; only the numeric-factorization hook is redirected to the CUSPARSE
   version so the factors end up on the GPU.
*/
2009ae82921SPaul Mullowney PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
2019ae82921SPaul Mullowney {
2029ae82921SPaul Mullowney   PetscErrorCode     ierr;
2039ae82921SPaul Mullowney 
2049ae82921SPaul Mullowney   PetscFunctionBegin;
2059ae82921SPaul Mullowney   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
2069ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2079ae82921SPaul Mullowney   PetscFunctionReturn(0);
2089ae82921SPaul Mullowney }
2099ae82921SPaul Mullowney 
2109ae82921SPaul Mullowney #undef __FUNCT__
2119ae82921SPaul Mullowney #define __FUNCT__ "MatCUSPARSEBuildLowerTriMatrix"
/*
   Extracts the unit lower-triangular factor L from the factored Mat_SeqAIJ
   storage, assembles it in CSR form in pinned host memory, and uploads it to
   the GPU (CUSPARSE_MATRIX_TYPE_TRIANGULAR / CUSPARSE_FILL_MODE_LOWER),
   including running the cusparse triangular-solve analysis.
   NOTE(review): assumes a->i/a->j/a->a of a factored matrix hold the strictly
   lower part at the front of each row (row 0 of L is the diagonal 1 alone) --
   confirm against the Mat_SeqAIJ factored-storage layout.
*/
2129ae82921SPaul Mullowney PetscErrorCode MatCUSPARSEBuildLowerTriMatrix(Mat A)
2139ae82921SPaul Mullowney {
2149ae82921SPaul Mullowney   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2159ae82921SPaul Mullowney   PetscInt          n = A->rmap->n;
2169ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
2179ae82921SPaul Mullowney   GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
2189ae82921SPaul Mullowney   cusparseStatus_t stat;
2199ae82921SPaul Mullowney   const PetscInt    *ai = a->i,*aj = a->j,*vi;
2209ae82921SPaul Mullowney   const MatScalar   *aa = a->a,*v;
2219ae82921SPaul Mullowney   PetscErrorCode     ierr;
2229ae82921SPaul Mullowney   PetscInt *AiLo, *AjLo;
2239ae82921SPaul Mullowney   PetscScalar *AALo;
2249ae82921SPaul Mullowney   PetscInt i,nz, nzLower, offset, rowOffset;
2259ae82921SPaul Mullowney 
2269ae82921SPaul Mullowney   PetscFunctionBegin;
2279ae82921SPaul Mullowney   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
2289ae82921SPaul Mullowney     try {
2299ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
2309ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
2319ae82921SPaul Mullowney 
2329ae82921SPaul Mullowney       /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host buffers for a fast host-to-device transfer */
2339ae82921SPaul Mullowney       ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
2349ae82921SPaul Mullowney       ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
2359ae82921SPaul Mullowney       ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);
2369ae82921SPaul Mullowney 
2379ae82921SPaul Mullowney       /* Fill the lower triangular matrix */
      /* row 0 contains only the unit diagonal entry */
2389ae82921SPaul Mullowney       AiLo[0]=(PetscInt) 0;
2399ae82921SPaul Mullowney       AiLo[n]=nzLower;
2409ae82921SPaul Mullowney       AjLo[0]=(PetscInt) 0;
2419ae82921SPaul Mullowney       AALo[0]=(MatScalar) 1.0;
2429ae82921SPaul Mullowney       v    = aa;
2439ae82921SPaul Mullowney       vi   = aj;
2449ae82921SPaul Mullowney       offset=1;
2459ae82921SPaul Mullowney       rowOffset=1;
2469ae82921SPaul Mullowney       for (i=1; i<n; i++) {
2479ae82921SPaul Mullowney 	nz  = ai[i+1] - ai[i];
2489ae82921SPaul Mullowney 	// additional 1 for the term on the diagonal
2499ae82921SPaul Mullowney 	AiLo[i]=rowOffset;
2509ae82921SPaul Mullowney 	rowOffset+=nz+1;
2519ae82921SPaul Mullowney 
	/* copy the strictly-lower entries of row i, then append the unit diagonal */
2529ae82921SPaul Mullowney 	ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
2539ae82921SPaul Mullowney 	ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
2549ae82921SPaul Mullowney 
2559ae82921SPaul Mullowney 	offset+=nz;
2569ae82921SPaul Mullowney 	AjLo[offset]=(PetscInt) i;
2579ae82921SPaul Mullowney 	AALo[offset]=(MatScalar) 1.0;
2589ae82921SPaul Mullowney 	offset+=1;
2599ae82921SPaul Mullowney 
2609ae82921SPaul Mullowney 	v  += nz;
2619ae82921SPaul Mullowney 	vi += nz;
2629ae82921SPaul Mullowney       }
      /* instantiate the GPU matrix in the requested storage format and run the solve analysis */
2639ae82921SPaul Mullowney       cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format);
2649ae82921SPaul Mullowney       stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
2659ae82921SPaul Mullowney       ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
2669ae82921SPaul Mullowney       stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
2679ae82921SPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;
2689ae82921SPaul Mullowney       // free temporary data
2699ae82921SPaul Mullowney       ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
2709ae82921SPaul Mullowney       ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
2719ae82921SPaul Mullowney       ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
2729ae82921SPaul Mullowney     } catch(char* ex) {
2739ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2749ae82921SPaul Mullowney     }
2759ae82921SPaul Mullowney   }
2769ae82921SPaul Mullowney   PetscFunctionReturn(0);
2779ae82921SPaul Mullowney }
2789ae82921SPaul Mullowney 
2799ae82921SPaul Mullowney #undef __FUNCT__
2809ae82921SPaul Mullowney #define __FUNCT__ "MatCUSPARSEBuildUpperTriMatrix"
/*
   Extracts the upper-triangular factor U from the factored Mat_SeqAIJ storage
   (walked backwards via a->diag), assembles it in CSR form in pinned host
   memory -- storing the RECIPROCAL of each diagonal entry so the GPU solve can
   multiply instead of divide -- and uploads it to the GPU
   (CUSPARSE_MATRIX_TYPE_TRIANGULAR / CUSPARSE_FILL_MODE_UPPER), including the
   cusparse triangular-solve analysis.
*/
2819ae82921SPaul Mullowney PetscErrorCode MatCUSPARSEBuildUpperTriMatrix(Mat A)
2829ae82921SPaul Mullowney {
2839ae82921SPaul Mullowney   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2849ae82921SPaul Mullowney   PetscInt          n = A->rmap->n;
2859ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
2869ae82921SPaul Mullowney   GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
2879ae82921SPaul Mullowney   cusparseStatus_t stat;
2889ae82921SPaul Mullowney   const PetscInt    *aj = a->j,*adiag = a->diag,*vi;
2899ae82921SPaul Mullowney   const MatScalar   *aa = a->a,*v;
2909ae82921SPaul Mullowney   PetscInt *AiUp, *AjUp;
2919ae82921SPaul Mullowney   PetscScalar *AAUp;
2929ae82921SPaul Mullowney   PetscInt i,nz, nzUpper, offset;
2939ae82921SPaul Mullowney   PetscErrorCode     ierr;
2949ae82921SPaul Mullowney 
2959ae82921SPaul Mullowney   PetscFunctionBegin;
2969ae82921SPaul Mullowney 
2979ae82921SPaul Mullowney   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
2989ae82921SPaul Mullowney     try {
2999ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
3009ae82921SPaul Mullowney       nzUpper = adiag[0]-adiag[n];
3019ae82921SPaul Mullowney 
3029ae82921SPaul Mullowney       /* Allocate Space for the upper triangular matrix */
      /* pinned (page-locked) host buffers for a fast host-to-device transfer */
3039ae82921SPaul Mullowney       ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
3049ae82921SPaul Mullowney       ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
3059ae82921SPaul Mullowney       ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
3069ae82921SPaul Mullowney 
3079ae82921SPaul Mullowney       /* Fill the upper triangular matrix */
3089ae82921SPaul Mullowney       AiUp[0]=(PetscInt) 0;
3099ae82921SPaul Mullowney       AiUp[n]=nzUpper;
3109ae82921SPaul Mullowney       offset = nzUpper;
      /* walk the rows from last to first, filling the CSR arrays back-to-front */
3119ae82921SPaul Mullowney       for (i=n-1; i>=0; i--){
3129ae82921SPaul Mullowney 	v   = aa + adiag[i+1] + 1;
3139ae82921SPaul Mullowney 	vi  = aj + adiag[i+1] + 1;
3149ae82921SPaul Mullowney 
3159ae82921SPaul Mullowney 	// number of elements NOT on the diagonal
3169ae82921SPaul Mullowney 	nz = adiag[i] - adiag[i+1]-1;
3179ae82921SPaul Mullowney 
3189ae82921SPaul Mullowney 	// decrement the offset
3199ae82921SPaul Mullowney 	offset -= (nz+1);
3209ae82921SPaul Mullowney 
3219ae82921SPaul Mullowney 	// first, set the diagonal elements
	// (stored inverted: v[nz] is the diagonal value in the factored storage)
3229ae82921SPaul Mullowney 	AjUp[offset] = (PetscInt) i;
3239ae82921SPaul Mullowney 	AAUp[offset] = 1./v[nz];
3249ae82921SPaul Mullowney 	AiUp[i] = AiUp[i+1] - (nz+1);
3259ae82921SPaul Mullowney 
3269ae82921SPaul Mullowney 	ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
3279ae82921SPaul Mullowney 	ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
3289ae82921SPaul Mullowney       }
      /* instantiate the GPU matrix in the requested storage format and run the solve analysis */
3299ae82921SPaul Mullowney       cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format);
3309ae82921SPaul Mullowney       stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
3319ae82921SPaul Mullowney       ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
3329ae82921SPaul Mullowney       stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
3339ae82921SPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;
3349ae82921SPaul Mullowney       // free temporary data
3359ae82921SPaul Mullowney       ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
3369ae82921SPaul Mullowney       ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
3379ae82921SPaul Mullowney       ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
3389ae82921SPaul Mullowney     } catch(char* ex) {
3399ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
3409ae82921SPaul Mullowney     }
3419ae82921SPaul Mullowney   }
3429ae82921SPaul Mullowney   PetscFunctionReturn(0);
3439ae82921SPaul Mullowney }
3449ae82921SPaul Mullowney 
3449ae82921SPaul Mullowney #undef __FUNCT__
/* FIX: __FUNCT__ was misspelled "MatCUSPARSEAnalysiAndCopyToGPU" (missing 's'),
   so PETSc error tracebacks reported a name that does not match the function
   (PETSc debug builds flag this mismatch). */
3459ae82921SPaul Mullowney #define __FUNCT__ "MatCUSPARSEAnalysisAndCopyToGPU"
/*
   Builds both triangular factors on the GPU, allocates the temporary vector
   used between the L- and U-solves, marks the GPU copy valid, and -- when the
   row/column orderings are not identities -- hands the permutation indices to
   the GPU factors so MatSolve_SeqAIJCUSPARSE can reorder on the device.
*/
3469ae82921SPaul Mullowney PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat A)
3479ae82921SPaul Mullowney {
3489ae82921SPaul Mullowney   PetscErrorCode     ierr;
3499ae82921SPaul Mullowney   Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data;
3509ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
3519ae82921SPaul Mullowney   IS               isrow = a->row,iscol = a->icol;
3529ae82921SPaul Mullowney   PetscBool        row_identity,col_identity;
3539ae82921SPaul Mullowney   const PetscInt   *r,*c;
3549ae82921SPaul Mullowney   PetscInt          n = A->rmap->n;
3559ae82921SPaul Mullowney 
3569ae82921SPaul Mullowney   PetscFunctionBegin;
3579ae82921SPaul Mullowney   ierr = MatCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
3589ae82921SPaul Mullowney   ierr = MatCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);
  /* scratch vector shared by the two triangular solves */
3599ae82921SPaul Mullowney   cusparseTriFactors->tempvec = new CUSPARRAY;
3609ae82921SPaul Mullowney   cusparseTriFactors->tempvec->resize(n);
3619ae82921SPaul Mullowney 
3629ae82921SPaul Mullowney   A->valid_GPU_matrix = PETSC_CUSP_BOTH;
3639ae82921SPaul Mullowney   //lower triangular indices
3649ae82921SPaul Mullowney   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3659ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3669ae82921SPaul Mullowney   if (!row_identity)
3679ae82921SPaul Mullowney     ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
3689ae82921SPaul Mullowney   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
3699ae82921SPaul Mullowney 
3709ae82921SPaul Mullowney   //upper triangular indices
3719ae82921SPaul Mullowney   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
3729ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
3739ae82921SPaul Mullowney   if (!col_identity)
3749ae82921SPaul Mullowney     ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
3759ae82921SPaul Mullowney   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
3769ae82921SPaul Mullowney   PetscFunctionReturn(0);
3779ae82921SPaul Mullowney }
3799ae82921SPaul Mullowney 
3809ae82921SPaul Mullowney #undef __FUNCT__
3819ae82921SPaul Mullowney #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric LU factorization: the factorization itself runs on the CPU via the
   base AIJ routine; afterwards the solve callback is chosen (the cheaper
   natural-ordering variant when both row and column permutations are
   identities) and the triangular factors are analyzed and copied to the GPU.
*/
3829ae82921SPaul Mullowney PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
3839ae82921SPaul Mullowney {
3849ae82921SPaul Mullowney   PetscErrorCode   ierr;
3859ae82921SPaul Mullowney   Mat_SeqAIJ       *b=(Mat_SeqAIJ *)B->data;
3869ae82921SPaul Mullowney   IS               isrow = b->row,iscol = b->col;
3879ae82921SPaul Mullowney   PetscBool        row_identity,col_identity;
3889ae82921SPaul Mullowney 
3899ae82921SPaul Mullowney   PetscFunctionBegin;
3909ae82921SPaul Mullowney   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
3919ae82921SPaul Mullowney   // determine which version of MatSolve needs to be used.
3929ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3939ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
3949ae82921SPaul Mullowney   if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
3959ae82921SPaul Mullowney   else                              B->ops->solve = MatSolve_SeqAIJCUSPARSE;
396*ca45077fSPaul Mullowney   //if (!row_identity) printf("Row permutations exist!");
397*ca45077fSPaul Mullowney   //if (!col_identity) printf("Col permutations exist!");
3989ae82921SPaul Mullowney 
3999ae82921SPaul Mullowney   // get the triangular factors
4009ae82921SPaul Mullowney   ierr = MatCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
4019ae82921SPaul Mullowney   PetscFunctionReturn(0);
4029ae82921SPaul Mullowney }
4039ae82921SPaul Mullowney 
4049ae82921SPaul Mullowney 
4059ae82921SPaul Mullowney 
4069ae82921SPaul Mullowney #undef __FUNCT__
4079ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   GPU triangular solve x = U^{-1} L^{-1} b for a permuted factorization:
   the right-hand side is permuted into x on the device (reorderIn), the two
   triangular solves run through the shared temporary vector, and the result
   is permuted back (reorderOut). Logs 2*nz - n flops, matching the CPU
   MatSolve_SeqAIJ accounting.
*/
4089ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
4099ae82921SPaul Mullowney {
4109ae82921SPaul Mullowney   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4119ae82921SPaul Mullowney   PetscErrorCode ierr;
4129ae82921SPaul Mullowney   CUSPARRAY      *xGPU, *bGPU;
4139ae82921SPaul Mullowney   cusparseStatus_t stat;
4149ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
4159ae82921SPaul Mullowney   GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
4169ae82921SPaul Mullowney   GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
4179ae82921SPaul Mullowney   CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;
4189ae82921SPaul Mullowney 
4199ae82921SPaul Mullowney   PetscFunctionBegin;
4209ae82921SPaul Mullowney   // Get the GPU pointers
4219ae82921SPaul Mullowney   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
4229ae82921SPaul Mullowney   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
4239ae82921SPaul Mullowney 
4249ae82921SPaul Mullowney   // solve with reordering
4259ae82921SPaul Mullowney   ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
4269ae82921SPaul Mullowney   stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
4279ae82921SPaul Mullowney   stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
4289ae82921SPaul Mullowney   ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);
4299ae82921SPaul Mullowney 
4309ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
4319ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  // block until the device work completes so the logged flops/time are accurate
4329ae82921SPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
4339ae82921SPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
4349ae82921SPaul Mullowney   PetscFunctionReturn(0);
4359ae82921SPaul Mullowney }
4369ae82921SPaul Mullowney 
4379ae82921SPaul Mullowney 
4389ae82921SPaul Mullowney 
4399ae82921SPaul Mullowney 
4409ae82921SPaul Mullowney #undef __FUNCT__
4419ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   GPU triangular solve x = U^{-1} L^{-1} b for the natural-ordering case
   (both row and column permutations are identities), so the reorderIn/
   reorderOut steps of MatSolve_SeqAIJCUSPARSE are skipped entirely.
*/
4429ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
4439ae82921SPaul Mullowney {
4449ae82921SPaul Mullowney   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
4459ae82921SPaul Mullowney   PetscErrorCode    ierr;
4469ae82921SPaul Mullowney   CUSPARRAY         *xGPU, *bGPU;
4479ae82921SPaul Mullowney   cusparseStatus_t stat;
4489ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
4499ae82921SPaul Mullowney   GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
4509ae82921SPaul Mullowney   GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
4519ae82921SPaul Mullowney   CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;
4529ae82921SPaul Mullowney 
4539ae82921SPaul Mullowney   PetscFunctionBegin;
4549ae82921SPaul Mullowney   // Get the GPU pointers
4559ae82921SPaul Mullowney   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
4569ae82921SPaul Mullowney   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
4579ae82921SPaul Mullowney 
4589ae82921SPaul Mullowney   // solve
4599ae82921SPaul Mullowney   stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
4609ae82921SPaul Mullowney   stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
4619ae82921SPaul Mullowney 
4629ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
4639ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  // block until the device work completes so the logged flops/time are accurate
4649ae82921SPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
4659ae82921SPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
4669ae82921SPaul Mullowney   PetscFunctionReturn(0);
4679ae82921SPaul Mullowney }
4689ae82921SPaul Mullowney 
4699ae82921SPaul Mullowney #undef __FUNCT__
#define __FUNCT__ "MatCUSPARSECopyToGPU"
/*
   MatCUSPARSECopyToGPU - Copies the CSR data of a SeqAIJ matrix from the host
   to the GPU, (re)building the cusparse matrix object stored in A->spptr.

   Only acts when the GPU copy is stale (PETSC_CUSP_UNALLOCATED or
   PETSC_CUSP_CPU); on success the matrix is marked PETSC_CUSP_BOTH.
   Any previously built GPU matrix and temp vector are deleted first.
*/
PetscErrorCode MatCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ      *a          = (Mat_SeqAIJ*)A->data;
  PetscInt        m           = A->rmap->n,*ii,*ridx;
  PetscErrorCode  ierr;


  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and ensured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat){
      try {
	delete cusparseMat->mat;
	if (cusparseMat->tempvec)
	  delete cusparseMat->tempvec;

      } catch(char* ex) {
	SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      /* Count rows with at least one nonzero; used both for flop logging and
         for the forced row compression below. */
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++)
	cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
	/* Host already stores a compressed-row structure; reuse it directly. */
	m    = a->compressedrow.nrows;
	ii   = a->compressedrow.i;
	ridx = a->compressedrow.rindex;
      } else {
	// Forcing compressed row on the GPU ... only relevant for CSR storage
	/* Build ii (row offsets over nonempty rows, terminated by nz) and
	   ridx (original row index of each nonempty row). */
	int k=0;
	ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
	ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
	ii[0]=0;
	for (int j = 0; j<m; j++) {
	  if ((a->i[j+1]-a->i[j])>0) {
	    ii[k] = a->i[j];
	    ridx[k]= j;
	    k++;
	  }
	}
	ii[cusparseMat->nonzerorow] = a->nz;
	m = cusparseMat->nonzerorow;
      }

      // Build our matrix ... first determine the GPU storage type
      cusparseMat->mat = GPU_Matrix_Factory::getNew(cusparseMat->format);

      // Create the streams and events (if desired).
      // NOTE(review): uses PETSC_COMM_WORLD rather than the matrix communicator;
      // presumably intentional for the parallel SpMV stream sharing — confirm.
      PetscMPIInt    size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      // FILL MODE UPPER is irrelevant
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      // lastly, build the matrix
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
	//      }
      if (!a->compressedrow.use) {
	// free data
	// assumes setMatrix/setCPRowIndices copied ii/ridx to the device and
	// do not retain these host pointers — TODO confirm against GPU_Matrix_Ifc
	ierr = PetscFree(ii);CHKERRQ(ierr);
	ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* Scratch vector of length m used by the triangular-solve/SpMV paths. */
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
      //A->spptr = cusparseMat;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    A->valid_GPU_matrix = PETSC_CUSP_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
5569ae82921SPaul Mullowney 
5579ae82921SPaul Mullowney // #undef __FUNCT__
5589ae82921SPaul Mullowney // #define __FUNCT__ "MatCUSPARSECopyFromGPU"
5599ae82921SPaul Mullowney // PetscErrorCode MatCUSPARSECopyFromGPU(Mat A, CUSPMATRIX *Agpu)
5609ae82921SPaul Mullowney // {
5619ae82921SPaul Mullowney //   Mat_SeqAIJCUSPARSE *cuspstruct = (Mat_SeqAIJCUSPARSE *) A->spptr;
5629ae82921SPaul Mullowney //   Mat_SeqAIJ     *a          = (Mat_SeqAIJ *) A->data;
5639ae82921SPaul Mullowney //   PetscInt        m          = A->rmap->n;
5649ae82921SPaul Mullowney //   PetscErrorCode  ierr;
5659ae82921SPaul Mullowney 
5669ae82921SPaul Mullowney //   PetscFunctionBegin;
5679ae82921SPaul Mullowney //   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
5689ae82921SPaul Mullowney //     if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
5699ae82921SPaul Mullowney //       try {
5709ae82921SPaul Mullowney //         cuspstruct->mat = Agpu;
5719ae82921SPaul Mullowney //         if (a->compressedrow.use) {
5729ae82921SPaul Mullowney //           //PetscInt *ii, *ridx;
5739ae82921SPaul Mullowney //           SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Cannot handle row compression for GPU matrices");
5749ae82921SPaul Mullowney //         } else {
5759ae82921SPaul Mullowney //           PetscInt i;
5769ae82921SPaul Mullowney 
5779ae82921SPaul Mullowney //           if (m+1 != (PetscInt) cuspstruct->mat->row_offsets.size()) {SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", cuspstruct->mat->row_offsets.size()-1, m);}
5789ae82921SPaul Mullowney //           a->nz    = cuspstruct->mat->values.size();
5799ae82921SPaul Mullowney //           a->maxnz = a->nz; /* Since we allocate exactly the right amount */
5809ae82921SPaul Mullowney //           A->preallocated = PETSC_TRUE;
5819ae82921SPaul Mullowney //           // Copy ai, aj, aa
5829ae82921SPaul Mullowney //           if (a->singlemalloc) {
5839ae82921SPaul Mullowney //             if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
5849ae82921SPaul Mullowney //           } else {
5859ae82921SPaul Mullowney //             if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
5869ae82921SPaul Mullowney //             if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
5879ae82921SPaul Mullowney //             if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
5889ae82921SPaul Mullowney //           }
5899ae82921SPaul Mullowney //           ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr);
5909ae82921SPaul Mullowney //           ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
5919ae82921SPaul Mullowney //           a->singlemalloc = PETSC_TRUE;
5929ae82921SPaul Mullowney //           thrust::copy(cuspstruct->mat->row_offsets.begin(), cuspstruct->mat->row_offsets.end(), a->i);
5939ae82921SPaul Mullowney //           thrust::copy(cuspstruct->mat->column_indices.begin(), cuspstruct->mat->column_indices.end(), a->j);
5949ae82921SPaul Mullowney //           thrust::copy(cuspstruct->mat->values.begin(), cuspstruct->mat->values.end(), a->a);
5959ae82921SPaul Mullowney //           // Setup row lengths
5969ae82921SPaul Mullowney //           if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
5979ae82921SPaul Mullowney //           ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr);
5989ae82921SPaul Mullowney //           ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
5999ae82921SPaul Mullowney //           for(i = 0; i < m; ++i) {
6009ae82921SPaul Mullowney //             a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i];
6019ae82921SPaul Mullowney //           }
6029ae82921SPaul Mullowney //           // a->diag?
6039ae82921SPaul Mullowney //         }
6049ae82921SPaul Mullowney //         cuspstruct->tempvec = new CUSPARRAY;
6059ae82921SPaul Mullowney //         cuspstruct->tempvec->resize(m);
6069ae82921SPaul Mullowney //       } catch(char *ex) {
6079ae82921SPaul Mullowney //         SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSP error: %s", ex);
6089ae82921SPaul Mullowney //       }
6099ae82921SPaul Mullowney //     }
6109ae82921SPaul Mullowney //     // This assembly prevents resetting the flag to PETSC_CUSP_CPU and recopying
6119ae82921SPaul Mullowney //     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
6129ae82921SPaul Mullowney //     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
6139ae82921SPaul Mullowney //     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
6149ae82921SPaul Mullowney //   } else {
6159ae82921SPaul Mullowney //     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Only valid for unallocated GPU matrices");
6169ae82921SPaul Mullowney //   }
6179ae82921SPaul Mullowney //   PetscFunctionReturn(0);
6189ae82921SPaul Mullowney // }
6199ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/*
   MatGetVecs_SeqAIJCUSPARSE - Creates GPU (VECSEQCUSP) vectors conforming to
   the matrix layout.

   Input Parameter:
.  mat - the matrix

   Output Parameters:
+  right - vector the matrix can be multiplied against (length cmap->n); may be PETSC_NULL
-  left  - vector the matrix result can be stored in (length rmap->n); may be PETSC_NULL
*/
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;

  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* The right vector conforms to the COLUMN layout; previously this used
       mat->rmap->bs, which is wrong for non-square block sizes. */
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
6449ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   MatMult_SeqAIJCUSPARSE - Computes yy = A*xx entirely on the GPU.

   The host-to-device copy of A is performed in MatAssemblyEnd_SeqAIJCUSPARSE,
   so the GPU matrix is assumed up to date here.
*/
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  CUSPARRAY          *xGPU,*yGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
    ierr = gpuMat->mat->multiply(xGPU, yGPU);CHKERRCUSP(ierr);
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* Only block the CPU when the multiply ran without a dedicated stream. */
  if (!gpuMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*aij->nz - gpuMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
6729ae82921SPaul Mullowney 
6739ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   MatMultTranspose_SeqAIJCUSPARSE - Computes yy = A^T*xx (A^H*xx in the
   complex case) on the GPU.

   The host-to-device copy of A is performed in MatAssemblyEnd_SeqAIJCUSPARSE,
   so the GPU matrix is assumed up to date here.
*/
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  CUSPARRAY          *xGPU,*yGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
#if !defined(PETSC_USE_COMPLEX)
    ierr = gpuMat->mat->multiply(xGPU, yGPU, TRANSPOSE);CHKERRCUSP(ierr);
#else
    /* Complex scalars: the conjugate transpose is the meaningful operation. */
    ierr = gpuMat->mat->multiply(xGPU, yGPU, HERMITIAN);CHKERRCUSP(ierr);
#endif
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* Only block the CPU when the multiply ran without a dedicated stream. */
  if (!gpuMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*aij->nz - gpuMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
705*ca45077fSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   MatMultAdd_SeqAIJCUSPARSE - Computes zz = yy + A*xx on the GPU.

   Implemented as a copy of yy into zz followed by an in-place
   multiply-accumulate into zz.
*/
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  CUSPARRAY          *xGPU,*yGPU,*zGPU;
  PetscErrorCode     ierr;
  PetscFunctionBegin;
  try {
    /* Seed z with y, then accumulate A*x into z. */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
    /* yy's array is acquired/restored for coherence bookkeeping only; the
       multiplyAdd below reads x and updates z in place. */
    ierr = VecCUSPGetArrayRead(yy,&yGPU);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zGPU);CHKERRQ(ierr);

    ierr = gpuMat->mat->multiplyAdd(xGPU, zGPU);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yGPU);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zGPU);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*aij->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
7379ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   MatMultTransposeAdd_SeqAIJCUSPARSE - Computes zz = yy + A^T*xx
   (yy + A^H*xx in the complex case) on the GPU.

   Fixes relative to the previous revision:
   - __FUNCT__ said "MatMultAdd_SeqAIJCUSPARSE", which mislabeled error traces.
   - multiplyAdd accumulated into yarray (the INPUT yy), leaving zz equal to
     yy and clobbering yy; it must accumulate into zarray, mirroring
     MatMultAdd_SeqAIJCUSPARSE which copies yy into zz and then updates zz.
*/
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray,*yarray,*zarray;
  PetscFunctionBegin;
  /* The host-to-device copy of A happens in MatAssemblyEnd_SeqAIJCUSPARSE. */
  try {
    /* Seed z with y, then accumulate A^T*x (A^H*x for complex) into z. */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    // multiply add with matrix transpose, accumulating into the OUTPUT array
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, HERMITIAN);CHKERRCUSP(ierr);
#endif

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
773*ca45077fSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
   MatAssemblyEnd_SeqAIJCUSPARSE - Completes host-side assembly, then pushes
   the CSR data to the GPU so subsequent MatMult variants find it up to date.
*/
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* MatCUSPARSECopyToGPU also marks the matrix PETSC_CUSP_BOTH, so no
     additional valid_GPU_matrix bookkeeping is needed here. */
  ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  PetscFunctionReturn(0);
}
7899ae82921SPaul Mullowney 
7909ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
7919ae82921SPaul Mullowney #undef __FUNCT__
7929ae82921SPaul Mullowney #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
7939ae82921SPaul Mullowney /*@C
7949ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
7959ae82921SPaul Mullowney    (the default parallel PETSc format).  For good matrix assembly performance
7969ae82921SPaul Mullowney    the user should preallocate the matrix storage by setting the parameter nz
7979ae82921SPaul Mullowney    (or the array nnz).  By setting these parameters accurately, performance
7989ae82921SPaul Mullowney    during matrix assembly can be increased by more than a factor of 50.
7999ae82921SPaul Mullowney 
8009ae82921SPaul Mullowney    Collective on MPI_Comm
8019ae82921SPaul Mullowney 
8029ae82921SPaul Mullowney    Input Parameters:
8039ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
8049ae82921SPaul Mullowney .  m - number of rows
8059ae82921SPaul Mullowney .  n - number of columns
8069ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
8079ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
8089ae82921SPaul Mullowney          (possibly different for each row) or PETSC_NULL
8099ae82921SPaul Mullowney 
8109ae82921SPaul Mullowney    Output Parameter:
8119ae82921SPaul Mullowney .  A - the matrix
8129ae82921SPaul Mullowney 
8139ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
8159ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
8169ae82921SPaul Mullowney 
8179ae82921SPaul Mullowney    Notes:
8189ae82921SPaul Mullowney    If nnz is given then nz is ignored
8199ae82921SPaul Mullowney 
8209ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
8219ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
8229ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
8239ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
8249ae82921SPaul Mullowney 
8259ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
8269ae82921SPaul Mullowney    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
8279ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
8289ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
8299ae82921SPaul Mullowney 
8309ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
8319ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
8329ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
8339ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
8349ae82921SPaul Mullowney 
8359ae82921SPaul Mullowney    Level: intermediate
8369ae82921SPaul Mullowney 
8379ae82921SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
8389ae82921SPaul Mullowney 
8399ae82921SPaul Mullowney @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* Use the public, type-dispatching preallocation interface (the paradigm the
     man page above recommends) instead of calling the internal
     MatSeqAIJSetPreallocation_SeqAIJ directly. */
  ierr = MatSeqAIJSetPreallocation(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
8519ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   MatDestroy_SeqAIJCUSPARSE - Frees the GPU-side data attached to A->spptr
   (regular matrix container or triangular-factor container, depending on
   factortype), then delegates to MatDestroy_SeqAIJ for the host data.
*/
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode        ierr;
  Mat_SeqAIJCUSPARSE      *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    // The regular matrices
    try {
      /* The GPU matrix only exists if it was ever copied/allocated. */
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
	delete (GPU_Matrix_Ifc *)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0)
	delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    // The triangular factors
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      // destroy the matrix and the container
      delete (GPU_Matrix_Ifc *)cusparseMatLo;
      delete (GPU_Matrix_Ifc *)cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  // NOTE(review): MAT_cusparseHandle is a file-global shared by ALL
  // aijcusparse matrices, but it is destroyed here when ANY single matrix is
  // destroyed; other live matrices would then use a destroyed handle.
  // Confirm whether handle lifetime should instead be tied to PetscFinalize
  // or reference-counted.
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);
    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
8999ae82921SPaul Mullowney 
9009ae82921SPaul Mullowney 
9019ae82921SPaul Mullowney EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   MatCreate_SeqAIJCUSPARSE - Type constructor for MATSEQAIJCUSPARSE.

   Builds on MatCreate_SeqAIJ, attaches the GPU container to B->spptr,
   creates the shared cusparse handle on first use, and installs the
   CUSPARSE implementations of the Mat operations.
*/
PetscErrorCode  MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  GPUStorageFormat format = CSR;  /* default GPU storage; changeable via MatSetFromOptions */

  PetscFunctionBegin;
  ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.*/
    // now build a GPU matrix data structure
    B->spptr        = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->mat = 0;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->tempvec = 0;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->format = format;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    /* Previously this branch wrote through B->spptr without ever allocating
       it; allocate the container here if no caller has done so already. */
    if (!B->spptr) B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->loTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->upTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->tempvec = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->format = format;
  }
  // Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...)
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  // Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
  // default cusparse tri solve. Note the difference with the implementation in
  // MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->setoption        = MatSetOption_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
  PetscFunctionReturn(0);
}
9479ae82921SPaul Mullowney EXTERN_C_END
9489ae82921SPaul Mullowney 
949