xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 8dc1d2a3a0b0219f67591c34b0f761797c23be35)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney     Defines the basic matrix operations for the AIJ (compressed row)
39ae82921SPaul Mullowney   matrix storage format.
49ae82921SPaul Mullowney */
59ae82921SPaul Mullowney 
69ae82921SPaul Mullowney #include "petscconf.h"
79ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_BEGIN
89ae82921SPaul Mullowney #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
99ae82921SPaul Mullowney //#include "petscbt.h"
109ae82921SPaul Mullowney #include "../src/vec/vec/impls/dvecimpl.h"
119ae82921SPaul Mullowney #include "petsc-private/vecimpl.h"
129ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_END
139ae82921SPaul Mullowney #undef VecType
149ae82921SPaul Mullowney #include "cusparsematimpl.h"
159ae82921SPaul Mullowney 
// HACK: a single file-scope CUDA stream shared between GPU_Matrix_Ifc
// instances.  The parallel SpMV needs the same stream in two different data
// structures and there is currently no clean way to pass it from one
// GPU_Matrix_Ifc object to another, so one instance is created here and
// reused everywhere.
cudaStream_t theBodyStream=0;
229ae82921SPaul Mullowney 
239ae82921SPaul Mullowney EXTERN_C_BEGIN
249ae82921SPaul Mullowney PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
259ae82921SPaul Mullowney PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
269ae82921SPaul Mullowney PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *);
279ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
289ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
299ae82921SPaul Mullowney PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
309ae82921SPaul Mullowney PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat);
31*8dc1d2a3SPaul Mullowney PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
32*8dc1d2a3SPaul Mullowney PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
33*8dc1d2a3SPaul Mullowney PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
34*8dc1d2a3SPaul Mullowney PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
359ae82921SPaul Mullowney EXTERN_C_END
369ae82921SPaul Mullowney 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/*
   Query callback composed on CUSPARSE factor matrices (see
   MatGetFactor_seqaij_cusparse below): reports MATSOLVERCUSPARSE as the
   solver package that produced the factor.
*/
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
EXTERN_C_END
479ae82921SPaul Mullowney 
489ae82921SPaul Mullowney EXTERN_C_BEGIN
499ae82921SPaul Mullowney extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
509ae82921SPaul Mullowney EXTERN_C_END
519ae82921SPaul Mullowney 
529ae82921SPaul Mullowney 
539ae82921SPaul Mullowney 
549ae82921SPaul Mullowney /*MC
559ae82921SPaul Mullowney   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices
569ae82921SPaul Mullowney   on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp
589ae82921SPaul Mullowney   ./configure --download-txpetscgpu to install PETSc to use CUSPARSE solves
599ae82921SPaul Mullowney 
609ae82921SPaul Mullowney   Options Database Keys:
61ca45077fSPaul Mullowney + -mat_mult_cusparse_storage_format <CSR>         - (choose one of) CSR, ELL, HYB
- -mat_solve_cusparse_storage_format <CSR>        - (choose one of) CSR, ELL, HYB
639ae82921SPaul Mullowney 
649ae82921SPaul Mullowney    Level: beginner
659ae82921SPaul Mullowney M*/
669ae82921SPaul Mullowney 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Creates a factor matrix for LU/ILU/ILUDT that performs the triangular
   solves on the GPU: the plain CPU AIJ factor is created first, then retyped
   to MATSEQAIJCUSPARSE and wired up with the CUSPARSE symbolic-factorization
   entry points.  Any other factor type is rejected.
*/
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* start from the standard CPU AIJ factor matrix */
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype != MAT_FACTOR_LU && ftype != MAT_FACTOR_ILU && ftype != MAT_FACTOR_ILUDT) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }
  /* retype for the GPU and install the CUSPARSE symbolic stages */
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
  (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  (*B)->factortype             = ftype;
  PetscFunctionReturn(0);
}
EXTERN_C_END
879ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatAIJCUSPARSESetGPUStorageFormatForMatSolve"
/*
   Changes the GPU storage format used for the triangular-solve factors.  If
   the factors already exist on the GPU in the old format they are destroyed
   and rebuilt in the new format via MatCUSPARSEAnalysisAndCopyToGPU().
*/
PetscErrorCode MatAIJCUSPARSESetGPUStorageFormatForMatSolve(Mat A,GPUStorageFormat format)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc* cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  PetscFunctionBegin;
  cusparseTriFactors->format = format;
  if (cusparseMatLo && cusparseMatUp) {
    /* data exists in the old format: delete both factors (the outer check
       already guarantees they are non-NULL) */
    delete cusparseMatLo;
    delete cusparseMatUp;
    /* null the stale pointers immediately; otherwise A->spptr keeps
       references to freed memory until the rebuild below overwrites them,
       which is a use-after-free hazard if the rebuild fails part way */
    cusparseTriFactors->loTriFactorPtr = 0;
    cusparseTriFactors->upTriFactorPtr = 0;

    // key data was deleted so reset the flag.
    A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

    // rebuild matrix
    ierr=MatCUSPARSEAnalysisAndCopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1139ae82921SPaul Mullowney 
//EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetOption_SeqAIJCUSPARSE"
/*
   Forwards the option to the base SeqAIJ implementation, then records the
   requested GPU storage format (CSR, ELL, or HYB) in the Mat_SeqAIJCUSPARSE
   data.  DIA is rejected as unsupported.  The DIAGBLOCK/OFFDIAGBLOCK
   variants are accepted alongside the plain options — presumably so the MPI
   matrix type can reuse this path (see the error message); confirm against
   the MPI callers.  Options unrelated to storage format fall through the
   default case with no CUSPARSE-specific action.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool  flg)
{
  Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode ierr;
  PetscFunctionBegin;
  ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  switch (op) {
  case MAT_DIAGBLOCK_CSR:
  case MAT_OFFDIAGBLOCK_CSR:
  case MAT_CSR:
    cusparseMat->format = CSR;
    break;
  case MAT_DIAGBLOCK_DIA:
  case MAT_OFFDIAGBLOCK_DIA:
  case MAT_DIA:
    /* SETERRQ returns from the function, so no break is needed */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported GPU matrix storage format DIA for (MPI,SEQ)AIJCUSPARSE matrix type.");
  case MAT_DIAGBLOCK_ELL:
  case MAT_OFFDIAGBLOCK_ELL:
  case MAT_ELL:
    cusparseMat->format = ELL;
    break;
  case MAT_DIAGBLOCK_HYB:
  case MAT_OFFDIAGBLOCK_HYB:
  case MAT_HYB:
    cusparseMat->format = HYB;
    break;
  default:
    /* not a storage-format option: nothing more to do */
    break;
  }
  PetscFunctionReturn(0);
}
//EXTERN_C_END
1499ae82921SPaul Mullowney 
//EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes the runtime options selecting the GPU storage format: for a
   non-factored matrix -mat_mult_cusparse_storage_format controls the SpMV
   storage, while for a factor matrix -mat_solve_cusparse_storage_format
   controls the triangular-solve storage.
*/
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  PetscInt       idx=0;
  char * formats[] ={CSR,ELL,HYB};
  MatOption format=MAT_CSR;   /* default when the option is absent */
  PetscBool      flg;
  PetscFunctionBegin;
  ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"When using TxPETSCGPU, AIJCUSPARSE Options","Mat");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEList("-mat_mult_cusparse_storage_format",
			     "Set the storage format of (seq)aijcusparse gpu matrices for SpMV",
			     "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr);
    /* Map the selected list index straight to the MatOption.  The previous
       code compared formats[idx] against the CSR/ELL/HYB string-literal
       macros with ==, which relies on the compiler merging identical string
       literals -- behavior that is unspecified -- and left `format`
       uninitialized when no pointer matched. */
    switch (idx) {
    case 0: format = MAT_CSR; break;
    case 1: format = MAT_ELL; break;
    case 2: format = MAT_HYB; break;
    }
    ierr=MatSetOption_SeqAIJCUSPARSE(A,format,PETSC_TRUE);CHKERRQ(ierr);
  }
  else {
    ierr = PetscOptionsEList("-mat_solve_cusparse_storage_format",
			     "Set the storage format of (seq)aijcusparse gpu matrices for TriSolve",
			     "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr);
    ierr=MatAIJCUSPARSESetGPUStorageFormatForMatSolve(A,formats[idx]);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
//EXTERN_C_END
1869ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic ILU factorization: delegates entirely to the CPU SeqAIJ
   implementation, then redirects the numeric stage to the CUSPARSE version
   so the computed factors are copied to the GPU afterwards.
*/
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1989ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic LU factorization: delegates entirely to the CPU SeqAIJ
   implementation, then redirects the numeric stage to the CUSPARSE version
   so the computed factors are copied to the GPU afterwards.
*/
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2109ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEBuildLowerTriMatrix"
/*
   Builds the lower-triangular factor L on the GPU from the host factored
   SeqAIJ data.  An explicit CSR lower-triangular matrix with 1.0 stored on
   the diagonal is assembled in page-locked host memory (the a->i/a->j/a->a
   arrays hold the strictly-lower entries), handed to the GPU_Matrix_Ifc
   implementation selected by cusparseTriFactors->format, and the cusparse
   triangular-solve analysis is run.  Nothing is done if the GPU copy is
   already current.
*/
PetscErrorCode MatCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *ai = a->i,*aj = a->j,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscErrorCode     ierr;
  PetscInt *AiLo, *AjLo;   /* CSR row pointers / column indices of L (pinned host memory) */
  PetscScalar *AALo;       /* CSR values of L (pinned host memory) */
  PetscInt i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* = n unit-diagonal entries + strictly-lower entries of rows 1..n-1
         (ai[n]-ai[1]; row 0 contributes only its diagonal) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host memory so the device upload in setMatrix can be fast */
      ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      /* row 0 consists solely of the unit diagonal entry */
      AiLo[0]=(PetscInt) 0;
      AiLo[n]=nzLower;
      AjLo[0]=(PetscInt) 0;
      AALo[0]=(MatScalar) 1.0;
      v    = aa;
      vi   = aj;
      offset=1;     /* next free slot in AjLo/AALo */
      rowOffset=1;  /* running CSR row pointer */
      for (i=1; i<n; i++) {
	nz  = ai[i+1] - ai[i];
	// additional 1 for the term on the diagonal
	AiLo[i]=rowOffset;
	rowOffset+=nz+1;

	/* copy the strictly-lower entries of row i, then append the unit diagonal */
	ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
	ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

	offset+=nz;
	AjLo[offset]=(PetscInt) i;
	AALo[offset]=(MatScalar) 1.0;
	offset+=1;

	v  += nz;
	vi += nz;
      }
      /* instantiate the GPU matrix in the requested storage format and run
         the cusparse analysis phase for the lower triangular solve */
      cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;
      // free temporary data
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
2799ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEBuildUpperTriMatrix"
/*
   Builds the upper-triangular factor U on the GPU from the host factored
   SeqAIJ data.  The U part is addressed through a->diag (adiag), whose
   entries decrease with the row index here, so the CSR arrays are assembled
   back-to-front; the stored diagonal value is inverted (1./v[nz]) before
   upload.  The assembled matrix is handed to the GPU_Matrix_Ifc
   implementation selected by cusparseTriFactors->format and the cusparse
   triangular-solve analysis is run.  Nothing is done if the GPU copy is
   already current.
*/
PetscErrorCode MatCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscInt *AiUp, *AjUp;   /* CSR row pointers / column indices of U (pinned host memory) */
  PetscScalar *AAUp;       /* CSR values of U (pinned host memory) */
  PetscInt i,nz, nzUpper, offset;
  PetscErrorCode     ierr;

  PetscFunctionBegin;

  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag decreases from adiag[0] to adiag[n], so this difference is the
         total U entry count, diagonal included */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned (page-locked) host memory so the device upload in setMatrix can be fast */
      ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      /* walk rows from last to first, filling the CSR arrays from the back */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--){
	/* off-diagonal entries of row i start one past adiag[i+1] */
	v   = aa + adiag[i+1] + 1;
	vi  = aj + adiag[i+1] + 1;

	// number of elements NOT on the diagonal
	nz = adiag[i] - adiag[i+1]-1;

	// decrement the offset
	offset -= (nz+1);

	// first, set the diagonal elements
	/* the diagonal is stored inverted so the GPU solve can multiply */
	AjUp[offset] = (PetscInt) i;
	AAUp[offset] = 1./v[nz];
	AiUp[i] = AiUp[i+1] - (nz+1);

	ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
	ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* instantiate the GPU matrix in the requested storage format and run
         the cusparse analysis phase for the upper triangular solve */
      cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;
      // free temporary data
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
3459ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEAnalysisAndCopyToGPU"
/*
   Copies both triangular factors of a factored matrix to the GPU, allocates
   the temporary vector used between the forward and backward solves, and,
   when the factorization used nontrivial orderings, records the row/column
   permutation indices on each factor so MatSolve_SeqAIJCUSPARSE can reorder
   on the GPU.  (The __FUNCT__ string previously read
   "MatCUSPARSEAnalysiAndCopyToGPU" — a typo that mislabeled error
   tracebacks; it now matches the function name.)
*/
PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS               isrow = a->row,iscol = a->icol;
  PetscBool        row_identity,col_identity;
  const PetscInt   *r,*c;
  PetscInt          n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);
  /* scratch vector shared by the two triangular solves */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  //lower triangular indices
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity)
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  //upper triangular indices
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity)
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3809ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric LU factorization: computes the factors on the CPU via the SeqAIJ
   implementation, selects the appropriate MatSolve variant based on whether
   the row/column permutations are identities, restores the SpMV function
   pointers on A, and copies the factors to the GPU.
*/
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode   ierr;
  Mat_SeqAIJ       *b=(Mat_SeqAIJ *)B->data;
  IS               isrow = b->row,iscol = b->col;
  PetscBool        row_identity,col_identity;

  PetscFunctionBegin;
  /* the numeric factorization itself runs on the CPU */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* identity permutations allow the cheaper natural-ordering solve */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  B->ops->solve = (row_identity && col_identity) ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering
                                                 : MatSolve_SeqAIJCUSPARSE;

  // Not sure why this is necessary but somehow the function pointers got reset
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  /* upload the triangular factors and run the cusparse solve analysis */
  ierr = MatCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4119ae82921SPaul Mullowney 
4129ae82921SPaul Mullowney 
4139ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   Solves the factored system on the GPU when the factorization used
   nontrivial row/column orderings: the right-hand side is permuted into x
   (reorderIn), the forward (L) and backward (U) triangular solves run
   through the temporary vector, and the result is permuted back
   (reorderOut).  Flop count matches the standard CPU MatSolve.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  CUSPARRAY      *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  // Get the GPU pointers
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  // solve with reordering
  ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
  ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  /* block until the GPU work completes so the flop log and callers see a finished solve */
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4449ae82921SPaul Mullowney 
4459ae82921SPaul Mullowney 
4469ae82921SPaul Mullowney 
4479ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   Solves the factored system on the GPU for the natural-ordering case
   (identity row and column permutations): a forward solve with L followed by
   a backward solve with U, with no reordering of the vectors.  Flop count
   matches the standard CPU MatSolve.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFact  = (GPU_Matrix_Ifc*)factors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFact  = (GPU_Matrix_Ifc*)factors->upTriFactorPtr;
  CUSPARRAY                    *workGPU = (CUSPARRAY*)factors->tempvec;
  CUSPARRAY                    *xarrayGPU,*barrayGPU;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* map the PETSc vectors onto their GPU-resident arrays */
  ierr = VecCUSPGetArrayWrite(xx,&xarrayGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barrayGPU);CHKERRQ(ierr);

  /* forward solve L*t = b, then backward solve U*x = t */
  stat = loFact->solve(barrayGPU, workGPU);CHKERRCUSP(stat);
  stat = upFact->solve(workGPU, xarrayGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&barrayGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarrayGPU);CHKERRQ(ierr);
  /* block until the GPU work completes before logging flops */
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*aij->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4769ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSECopyToGPU"
/*
   MatCUSPARSECopyToGPU - (Re)builds the cusparse representation of A from the
   host-side Mat_SeqAIJ data.  A no-op unless the GPU copy is missing
   (PETSC_CUSP_UNALLOCATED) or stale (PETSC_CUSP_CPU); on success the matrix
   is marked PETSC_CUSP_BOTH.
*/
PetscErrorCode MatCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ      *a          = (Mat_SeqAIJ*)A->data;
  PetscInt        m           = A->rmap->n,*ii,*ridx;
  PetscErrorCode  ierr;


  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat){
      /* discard any previous GPU matrix and work vector before rebuilding */
      try {
	delete cusparseMat->mat;
	if (cusparseMat->tempvec)
	  delete cusparseMat->tempvec;

      } catch(char* ex) {
	SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      /* count rows that contain at least one nonzero; used both for flop
         logging and to size the compressed-row index arrays built below */
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++)
	cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
	/* host matrix already stores compressed-row indices; reuse them directly */
	m    = a->compressedrow.nrows;
	ii   = a->compressedrow.i;
	ridx = a->compressedrow.rindex;
      } else {
	// Forcing compressed row on the GPU ... only relevant for CSR storage
	int k=0;
	ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
	ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
	ii[0]=0;
	/* ii[] collects the row offsets of the nonempty rows,
	   ridx[] their original (uncompressed) row numbers */
	for (int j = 0; j<m; j++) {
	  if ((a->i[j+1]-a->i[j])>0) {
	    ii[k] = a->i[j];
	    ridx[k]= j;
	    k++;
	  }
	}
	ii[cusparseMat->nonzerorow] = a->nz;
	m = cusparseMat->nonzerorow;
      }

      // Build our matrix ... first determine the GPU storage type
      cusparseMat->mat = GPU_Matrix_Factory::getNew(cusparseMat->format);

      // Create the streams and events (if desired).
      // NOTE(review): the size is queried on PETSC_COMM_WORLD rather than the
      // matrix's own communicator -- confirm this is intended (it decides
      // whether the parallel-SpMV streams are created).
      PetscMPIInt    size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      // FILL MODE UPPER is irrelevant
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      // lastly, build the matrix
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
	//      }
      if (!a->compressedrow.use) {
	// free data -- ii/ridx were allocated above in this branch; presumably
	// setMatrix/setCPRowIndices copied them (TODO confirm, since they are
	// freed here while the GPU matrix lives on)
	ierr = PetscFree(ii);CHKERRQ(ierr);
	ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* scratch vector sized to the number of (compressed) rows */
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
      //A->spptr = cusparseMat;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    A->valid_GPU_matrix = PETSC_CUSP_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
5649ae82921SPaul Mullowney 
5659ae82921SPaul Mullowney // #undef __FUNCT__
5669ae82921SPaul Mullowney // #define __FUNCT__ "MatCUSPARSECopyFromGPU"
5679ae82921SPaul Mullowney // PetscErrorCode MatCUSPARSECopyFromGPU(Mat A, CUSPMATRIX *Agpu)
5689ae82921SPaul Mullowney // {
5699ae82921SPaul Mullowney //   Mat_SeqAIJCUSPARSE *cuspstruct = (Mat_SeqAIJCUSPARSE *) A->spptr;
5709ae82921SPaul Mullowney //   Mat_SeqAIJ     *a          = (Mat_SeqAIJ *) A->data;
5719ae82921SPaul Mullowney //   PetscInt        m          = A->rmap->n;
5729ae82921SPaul Mullowney //   PetscErrorCode  ierr;
5739ae82921SPaul Mullowney 
5749ae82921SPaul Mullowney //   PetscFunctionBegin;
5759ae82921SPaul Mullowney //   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
5769ae82921SPaul Mullowney //     if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
5779ae82921SPaul Mullowney //       try {
5789ae82921SPaul Mullowney //         cuspstruct->mat = Agpu;
5799ae82921SPaul Mullowney //         if (a->compressedrow.use) {
5809ae82921SPaul Mullowney //           //PetscInt *ii, *ridx;
5819ae82921SPaul Mullowney //           SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Cannot handle row compression for GPU matrices");
5829ae82921SPaul Mullowney //         } else {
5839ae82921SPaul Mullowney //           PetscInt i;
5849ae82921SPaul Mullowney 
5859ae82921SPaul Mullowney //           if (m+1 != (PetscInt) cuspstruct->mat->row_offsets.size()) {SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", cuspstruct->mat->row_offsets.size()-1, m);}
5869ae82921SPaul Mullowney //           a->nz    = cuspstruct->mat->values.size();
5879ae82921SPaul Mullowney //           a->maxnz = a->nz; /* Since we allocate exactly the right amount */
5889ae82921SPaul Mullowney //           A->preallocated = PETSC_TRUE;
5899ae82921SPaul Mullowney //           // Copy ai, aj, aa
5909ae82921SPaul Mullowney //           if (a->singlemalloc) {
5919ae82921SPaul Mullowney //             if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
5929ae82921SPaul Mullowney //           } else {
5939ae82921SPaul Mullowney //             if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
5949ae82921SPaul Mullowney //             if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
5959ae82921SPaul Mullowney //             if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
5969ae82921SPaul Mullowney //           }
5979ae82921SPaul Mullowney //           ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr);
5989ae82921SPaul Mullowney //           ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
5999ae82921SPaul Mullowney //           a->singlemalloc = PETSC_TRUE;
6009ae82921SPaul Mullowney //           thrust::copy(cuspstruct->mat->row_offsets.begin(), cuspstruct->mat->row_offsets.end(), a->i);
6019ae82921SPaul Mullowney //           thrust::copy(cuspstruct->mat->column_indices.begin(), cuspstruct->mat->column_indices.end(), a->j);
6029ae82921SPaul Mullowney //           thrust::copy(cuspstruct->mat->values.begin(), cuspstruct->mat->values.end(), a->a);
6039ae82921SPaul Mullowney //           // Setup row lengths
6049ae82921SPaul Mullowney //           if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
6059ae82921SPaul Mullowney //           ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr);
6069ae82921SPaul Mullowney //           ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
6079ae82921SPaul Mullowney //           for(i = 0; i < m; ++i) {
6089ae82921SPaul Mullowney //             a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i];
6099ae82921SPaul Mullowney //           }
6109ae82921SPaul Mullowney //           // a->diag?
6119ae82921SPaul Mullowney //         }
6129ae82921SPaul Mullowney //         cuspstruct->tempvec = new CUSPARRAY;
6139ae82921SPaul Mullowney //         cuspstruct->tempvec->resize(m);
6149ae82921SPaul Mullowney //       } catch(char *ex) {
6159ae82921SPaul Mullowney //         SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSP error: %s", ex);
6169ae82921SPaul Mullowney //       }
6179ae82921SPaul Mullowney //     }
6189ae82921SPaul Mullowney //     // This assembly prevents resetting the flag to PETSC_CUSP_CPU and recopying
6199ae82921SPaul Mullowney //     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
6209ae82921SPaul Mullowney //     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
6219ae82921SPaul Mullowney //     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
6229ae82921SPaul Mullowney //   } else {
6239ae82921SPaul Mullowney //     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Only valid for unallocated GPU matrices");
6249ae82921SPaul Mullowney //   }
6259ae82921SPaul Mullowney //   PetscFunctionReturn(0);
6269ae82921SPaul Mullowney // }
6279ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/*
   MatGetVecs_SeqAIJCUSPARSE - Creates CUSP vectors conforming to the matrix
   layouts: *right is compatible with x in y = A*x (column layout) and *left
   with y (row layout).  Either output pointer may be PETSC_NULL.

   Fix: the right vector previously took its block size from mat->rmap->bs
   (the row layout) even though every other attribute comes from the column
   layout; it now uses mat->cmap->bs, matching the PETSc convention for
   right vectors.
*/
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;

  if (right) {
    /* right vector lives in the column space of mat */
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    /* left vector lives in the row space of mat */
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
6529ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   MatMult_SeqAIJCUSPARSE - Computes yy = A*xx entirely on the GPU.
   The GPU copy of A is kept current by MatAssemblyEnd_SeqAIJCUSPARSE, so no
   explicit MatCUSPARSECopyToGPU(A) is needed here.
*/
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *aij      = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *gpuMat   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xGPU,*yGPU;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
    ierr = gpuMat->mat->multiply(xGPU, yGPU);CHKERRCUSP(ierr);
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* only synchronize when the multiply ran on the default (zero) stream */
  if (!gpuMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*aij->nz - gpuMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
6809ae82921SPaul Mullowney 
6819ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   MatMultTranspose_SeqAIJCUSPARSE - Computes yy = A^T*xx on the GPU; in
   complex builds the Hermitian transpose (A^H) is applied instead.
   The GPU copy of A is kept current by MatAssemblyEnd_SeqAIJCUSPARSE.
*/
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xGPU,*yGPU;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
#if !defined(PETSC_USE_COMPLEX)
    ierr = gpuMat->mat->multiply(xGPU, yGPU, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = gpuMat->mat->multiply(xGPU, yGPU, HERMITIAN);CHKERRCUSP(ierr);
#endif
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* only synchronize when the multiply ran on the default (zero) stream */
  if (!gpuMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*aij->nz - gpuMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
713ca45077fSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   MatMultAdd_SeqAIJCUSPARSE - Computes zz = A*xx + yy on the GPU by first
   copying yy into zz and then accumulating the product into zz.
   The GPU copy of A is kept current by MatAssemblyEnd_SeqAIJCUSPARSE.
*/
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xGPU,*yGPU,*zGPU;

  PetscFunctionBegin;
  try {
    /* seed zz with yy, then accumulate A*xx on top of it */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yGPU);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zGPU);CHKERRQ(ierr);

    ierr = gpuMat->mat->multiplyAdd(xGPU, zGPU);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yGPU);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zGPU);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*aij->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
7459ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   MatMultTransposeAdd_SeqAIJCUSPARSE - Computes zz = A^T*xx + yy on the GPU
   (A^H*xx + yy in complex builds).

   Fixes relative to the previous revision:
     - __FUNCT__ was mistakenly defined as "MatMultAdd_SeqAIJCUSPARSE",
       producing the wrong function name in PETSc error tracebacks.
     - the transpose multiply-add accumulated into yarray (the array of yy,
       obtained read-only) instead of zarray, so zz ended up holding only the
       copy of yy; the target is now zarray, consistent with
       MatMultAdd_SeqAIJCUSPARSE above.
*/
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray,*yarray,*zarray;
  PetscFunctionBegin;
  // The GPU copy of A is kept current by MatAssemblyEnd_SeqAIJCUSPARSE
  try {
    /* seed zz with yy, then accumulate the transpose product on top of it */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    // multiply add with matrix transpose, accumulating into the output array
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, HERMITIAN);CHKERRCUSP(ierr);
#endif

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
781ca45077fSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
   MatAssemblyEnd_SeqAIJCUSPARSE - Finishes host-side assembly and mirrors the
   matrix onto the GPU; on final assembly the GPU mat-vec kernels are installed
   in the operations table.
*/
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* push the freshly assembled data to the GPU (also updates valid_GPU_matrix) */
  ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* final assembly: route mat-vec operations through the GPU kernels */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
8019ae82921SPaul Mullowney 
/* --------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@C
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  For good matrix assembly performance
   the user should preallocate the matrix storage by setting the parameter nz
   (or the array nnz).  By setting these parameters accurately, performance
   during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or PETSC_NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()

@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* convenience wrapper: create, type-set, and preallocate in one call */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
8639ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   MatDestroy_SeqAIJCUSPARSE - Releases the GPU-side data hanging off A->spptr
   (either the regular GPU matrix or the triangular factors, depending on
   A->factortype), then delegates to MatDestroy_SeqAIJ for the host data.
*/
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode        ierr;
  Mat_SeqAIJCUSPARSE      *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    // The regular matrices
    try {
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
	delete (GPU_Matrix_Ifc *)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0)
	delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    // The triangular factors
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      // destroy the matrix and the container
      delete (GPU_Matrix_Ifc *)cusparseMatLo;
      delete (GPU_Matrix_Ifc *)cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  // NOTE(review): MAT_cusparseHandle is a library-wide handle shared by every
  // aijcusparse matrix (it is created once in MatCreate_SeqAIJCUSPARSE), yet
  // it is destroyed here whenever *any* such matrix is destroyed -- confirm
  // this is safe while other aijcusparse matrices are still alive.
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);
    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
9119ae82921SPaul Mullowney 
9129ae82921SPaul Mullowney 
9139ae82921SPaul Mullowney EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   MatCreate_SeqAIJCUSPARSE - Type constructor for MATSEQAIJCUSPARSE.  Builds
   the underlying SeqAIJ matrix, attaches GPU bookkeeping to B->spptr (either a
   regular GPU matrix container or triangular-factor pointers, depending on
   B->factortype), lazily creates the library-wide cusparse handle, and installs
   the CUSPARSE implementations in the operations table.
*/
PetscErrorCode  MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  GPUStorageFormat format = CSR;  /* default GPU storage; may be changed later via options */

  PetscFunctionBegin;
  ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.*/
    // now build a GPU matrix data structure
    B->spptr        = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->mat = 0;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->tempvec = 0;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->format = format;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr        = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->loTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->upTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->tempvec = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->format = format;
  }
  // Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...)
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  // Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
  // default cusparse tri solve. Note the difference with the implementation in
  // MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);
  /* install the CUSPARSE overrides; assemblyend will (re)set the mult ops again on final assembly */
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->setoption        = MatSetOption_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* nothing has been copied to the GPU yet */
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
  PetscFunctionReturn(0);
}
9609ae82921SPaul Mullowney EXTERN_C_END
9619ae82921SPaul Mullowney 
962