xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 9ae82921df069a58776bfe4da82b38e8ff7dd41c)
1*9ae82921SPaul Mullowney /*
2*9ae82921SPaul Mullowney     Defines the basic matrix operations for the AIJ (compressed row)
3*9ae82921SPaul Mullowney   matrix storage format.
4*9ae82921SPaul Mullowney */
5*9ae82921SPaul Mullowney 
6*9ae82921SPaul Mullowney #include "petscconf.h"
7*9ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_BEGIN
8*9ae82921SPaul Mullowney #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
9*9ae82921SPaul Mullowney //#include "petscbt.h"
10*9ae82921SPaul Mullowney #include "../src/vec/vec/impls/dvecimpl.h"
11*9ae82921SPaul Mullowney #include "petsc-private/vecimpl.h"
12*9ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_END
13*9ae82921SPaul Mullowney #undef VecType
14*9ae82921SPaul Mullowney #include "cusparsematimpl.h"
15*9ae82921SPaul Mullowney 
16*9ae82921SPaul Mullowney 
17*9ae82921SPaul Mullowney EXTERN_C_BEGIN
18*9ae82921SPaul Mullowney PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
19*9ae82921SPaul Mullowney PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
20*9ae82921SPaul Mullowney PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *);
21*9ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
22*9ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
23*9ae82921SPaul Mullowney PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
24*9ae82921SPaul Mullowney PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat);
25*9ae82921SPaul Mullowney EXTERN_C_END
26*9ae82921SPaul Mullowney 
27*9ae82921SPaul Mullowney EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/*
   Reports the solver package that produced this factored matrix; always
   returns MATSOLVERCUSPARSE ("cusparse").  Composed onto factor matrices by
   MatGetFactor_seqaij_cusparse() under the name "MatFactorGetSolverPackage_C".
*/
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
36*9ae82921SPaul Mullowney EXTERN_C_END
37*9ae82921SPaul Mullowney 
38*9ae82921SPaul Mullowney EXTERN_C_BEGIN
39*9ae82921SPaul Mullowney extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
40*9ae82921SPaul Mullowney EXTERN_C_END
41*9ae82921SPaul Mullowney 
42*9ae82921SPaul Mullowney 
43*9ae82921SPaul Mullowney 
44*9ae82921SPaul Mullowney /*MC
45*9ae82921SPaul Mullowney   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices
46*9ae82921SPaul Mullowney   on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp
47*9ae82921SPaul Mullowney 
48*9ae82921SPaul Mullowney   ./configure --download-txpetscgpu to install PETSc to use CUSPARSE solves
49*9ae82921SPaul Mullowney 
50*9ae82921SPaul Mullowney   Options Database Keys:
51*9ae82921SPaul Mullowney + -mat_mult_cusparse_storage_format <CSR>         - (choose one of) CSR, ELL
- -mat_solve_cusparse_storage_format <CSR>        - (choose one of) CSR, ELL
53*9ae82921SPaul Mullowney 
54*9ae82921SPaul Mullowney    Level: beginner
55*9ae82921SPaul Mullowney M*/
56*9ae82921SPaul Mullowney 
57*9ae82921SPaul Mullowney EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   MatGetFactor_seqaij_cusparse - Creates the factor matrix used for LU/ILU/ILUDT
   factorizations whose triangular solves will run on the GPU through CUSPARSE.

   The matrix is first created by the native seqaij factory, then retyped to
   MATSEQAIJCUSPARSE; the symbolic-factorization entry points are overridden here
   and those in turn redirect the numeric phase to the CUSPARSE version.
   Factor types other than LU/ILU/ILUDT are rejected with PETSC_ERR_SUP.
*/
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* delegate creation of the factor matrix to the native AIJ factory */
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype != MAT_FACTOR_LU && ftype != MAT_FACTOR_ILU && ftype != MAT_FACTOR_ILUDT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  /* route the symbolic factorizations through the CUSPARSE wrappers */
  (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
  (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
76*9ae82921SPaul Mullowney EXTERN_C_END
77*9ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatAIJCUSPARSESetGPUStorageFormatForMatMult"
/*
   Records the GPU storage format (CSR or ELL) to be used for sparse
   matrix-vector products.  Only the flag in the Mat_SeqAIJCUSPARSE struct is
   set; the GPU matrix built later by MatCUSPARSECopyToGPU() honors it.

   NOTE(review): assumes A is a (seq)aijcusparse matrix so that A->spptr really
   points to a Mat_SeqAIJCUSPARSE -- no type or null check is performed here.
*/
PetscErrorCode MatAIJCUSPARSESetGPUStorageFormatForMatMult(Mat A,const GPUStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  cusparseMat->format = format;
  PetscFunctionReturn(0);
}
87*9ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatAIJCUSPARSESetGPUStorageFormatForMatSolve"
/*
   Sets the GPU storage format (CSR or ELL) used for the triangular factors in
   MatSolve.  If factors in the old format already exist on the GPU they are
   destroyed and rebuilt in the new format via MatCUSPARSEAnalysisAndCopyToGPU().

   Fixes relative to the original:
   - removed the inner null checks, which were redundant inside the
     "if (cusparseMatLo && cusparseMatUp)" branch;
   - the struct's factor pointers are zeroed immediately after delete so they
     never dangle (a failed rebuild would otherwise leave freed pointers that
     could be deleted again later).
*/
PetscErrorCode MatAIJCUSPARSESetGPUStorageFormatForMatSolve(Mat A,const GPUStorageFormat format)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc* cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  PetscFunctionBegin;
  cusparseTriFactors->format = format;
  if (cusparseMatLo && cusparseMatUp) {
    // Factors in the old format exist: delete them ...
    delete cusparseMatLo;
    delete cusparseMatUp;
    // ... and do not leave dangling pointers behind in the struct.
    cusparseTriFactors->loTriFactorPtr = 0;
    cusparseTriFactors->upTriFactorPtr = 0;

    // key data was deleted so reset the flag.
    A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

    // rebuild the factors in the newly requested format
    ierr=MatCUSPARSEAnalysisAndCopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
113*9ae82921SPaul Mullowney 
114*9ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes the runtime options controlling the GPU storage format of a
   (seq)aijcusparse matrix.  For an unfactored matrix the format applies to
   SpMV (-mat_mult_cusparse_storage_format); for a factor matrix it applies to
   the triangular solves (-mat_solve_cusparse_storage_format).  The default,
   formats entry 0, is CSR.
*/
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscInt       chosen=0;                      /* index into storageFormats; 0 (CSR) is the default */
  const char     *storageFormats[]={CSR,ELL};
  PetscBool      set;

  PetscFunctionBegin;
  ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"When using TxPETSCGPU, AIJCUSPARSE Options","Mat");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* plain matrix: pick the format used for matrix-vector products */
    ierr = PetscOptionsEList("-mat_mult_cusparse_storage_format","Set the storage format of (seq)aijcusparse gpu matrices for SpMV","None",storageFormats,2,storageFormats[0],&chosen,&set);CHKERRQ(ierr);
    ierr = MatAIJCUSPARSESetGPUStorageFormatForMatMult(A,storageFormats[chosen]);CHKERRQ(ierr);
  } else {
    /* factor matrix: pick the format used for the triangular solves */
    ierr = PetscOptionsEList("-mat_solve_cusparse_storage_format","Set the storage format of (seq)aijcusparse gpu matrices for TriSolve","None",storageFormats,2,storageFormats[0],&chosen,&set);CHKERRQ(ierr);
    ierr = MatAIJCUSPARSESetGPUStorageFormatForMatSolve(A,storageFormats[chosen]);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
141*9ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic ILU factorization for SEQAIJCUSPARSE matrices.  The symbolic phase
   itself is done entirely by the CPU seqaij implementation; this wrapper only
   redirects the subsequent numeric phase to the CUSPARSE version, which copies
   the triangular factors to the GPU.
*/
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* numeric factorization must run through the GPU-aware routine */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
153*9ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic LU factorization for SEQAIJCUSPARSE matrices.  Delegates the
   symbolic phase to the CPU seqaij implementation and redirects the numeric
   phase to the CUSPARSE version so the factors end up on the GPU.
*/
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* numeric factorization must run through the GPU-aware routine */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
165*9ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEBuildLowerTriMatrix"
/*
   Assembles the unit lower triangular factor L in CSR form on the host,
   uploads it to the GPU (in the format selected by cusparseTriFactors->format)
   and runs the CUSPARSE solve analysis for it.  The result is stored in
   cusparseTriFactors->loTriFactorPtr.  No-op unless the GPU copy is
   unallocated or stale (valid_GPU_matrix is UNALLOCATED or CPU).

   NOTE(review): assumes the factored Mat_SeqAIJ layout where a->i/a->j/a->a
   hold the strictly lower triangular part row by row (row 0 empty), with the
   unit diagonal implicit -- confirm against MatLUFactorNumeric_SeqAIJ.
*/
PetscErrorCode MatCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *ai = a->i,*aj = a->j,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscErrorCode     ierr;
  PetscInt *AiLo, *AjLo;                /* host-side CSR row pointers / column indices of L */
  PetscScalar *AALo;                    /* host-side CSR values of L */
  PetscInt i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* cudaMallocHost gives page-locked memory, so the later transfer to the GPU is fast */
      ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      /* row 0 contains only the unit diagonal entry */
      AiLo[0]=(PetscInt) 0;
      AiLo[n]=nzLower;
      AjLo[0]=(PetscInt) 0;
      AALo[0]=(MatScalar) 1.0;
      v    = aa;
      vi   = aj;
      offset=1;                 /* next free slot in AjLo/AALo */
      rowOffset=1;              /* CSR row pointer for the current row */
      for (i=1; i<n; i++) {
	nz  = ai[i+1] - ai[i];
	// additional 1 for the term on the diagonal
	AiLo[i]=rowOffset;
	rowOffset+=nz+1;

	/* copy the strictly-lower entries of row i, then append the unit diagonal */
	ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
	ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

	offset+=nz;
	AjLo[offset]=(PetscInt) i;
	AALo[offset]=(MatScalar) 1.0;
	offset+=1;

	v  += nz;
	vi += nz;
      }
      /* create the GPU matrix in the requested storage format and hand it the CSR data */
      cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;
      // free temporary data
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
234*9ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEBuildUpperTriMatrix"
/*
   Assembles the upper triangular factor U in CSR form on the host (iterating
   rows from last to first), uploads it to the GPU and runs the CUSPARSE solve
   analysis for it.  The result is stored in
   cusparseTriFactors->upTriFactorPtr.  No-op unless the GPU copy is
   unallocated or stale (valid_GPU_matrix is UNALLOCATED or CPU).

   NOTE(review): assumes the factored Mat_SeqAIJ layout where a->diag[i] points
   at the (inverted) diagonal entry of row i and the strictly-upper entries of
   row i occupy aj/aa positions adiag[i+1]+1 .. adiag[i]-1 -- confirm against
   MatLUFactorNumeric_SeqAIJ.
*/
PetscErrorCode MatCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscInt *AiUp, *AjUp;                /* host-side CSR row pointers / column indices of U */
  PetscScalar *AAUp;                    /* host-side CSR values of U */
  PetscInt i,nz, nzUpper, offset;
  PetscErrorCode     ierr;

  PetscFunctionBegin;

  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* cudaMallocHost gives page-locked memory, so the later transfer to the GPU is fast */
      ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, walking rows from the bottom up */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--){
	/* v/vi point at the strictly-upper entries of row i */
	v   = aa + adiag[i+1] + 1;
	vi  = aj + adiag[i+1] + 1;

	// number of elements NOT on the diagonal
	nz = adiag[i] - adiag[i+1]-1;

	// decrement the offset
	offset -= (nz+1);

	// first, set the diagonal elements
	/* v[nz] is the stored diagonal; 1./v[nz] recovers the actual diagonal value
	   -- NOTE(review): presumes the factorization stores the inverted diagonal */
	AjUp[offset] = (PetscInt) i;
	AAUp[offset] = 1./v[nz];
	AiUp[i] = AiUp[i+1] - (nz+1);

	ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
	ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* create the GPU matrix in the requested storage format and hand it the CSR data */
      cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;
      // free temporary data
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
300*9ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEAnalysisAndCopyToGPU"  /* fixed: was "MatCUSPARSEAnalysiAndCopyToGPU" (typo broke error traces) */
/*
   Builds both triangular factors on the GPU, allocates the temporary work
   vector used between the L and U solves and, when the factorization uses
   non-identity row/column permutations, uploads those orderings as well.
   Marks the GPU copy current (PETSC_CUSP_BOTH).

   Fixes relative to the original:
   - corrected the __FUNCT__ string (missing 's');
   - a previously allocated tempvec is deleted before a new one is created,
     so rebuilding the factors (e.g. after a storage-format change) no longer
     leaks the old work vector.
*/
PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS               isrow = a->row,iscol = a->icol;
  PetscBool        row_identity,col_identity;
  const PetscInt   *r,*c;
  PetscInt          n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);
  /* do not leak a work vector from a previous build
     -- NOTE(review): assumes tempvec is zero-initialized at struct creation */
  if (cusparseTriFactors->tempvec) delete cusparseTriFactors->tempvec;
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  //lower triangular indices
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity)
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  //upper triangular indices
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity)
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
335*9ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric LU factorization for SEQAIJCUSPARSE.  The factorization itself runs
   on the CPU (MatLUFactorNumeric_SeqAIJ); afterwards the triangular factors
   are copied to the GPU and the CUSPARSE solve analysis is performed.  Picks
   the natural-ordering MatSolve when neither rows nor columns are permuted.

   Fixes relative to the original: removed the leftover std::cout debug
   statements and replaced the raw printf diagnostics (which lacked newlines
   and bypassed PETSc's logging) with PetscInfo.
*/
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode   ierr;
  Mat_SeqAIJ       *b=(Mat_SeqAIJ *)B->data;
  IS               isrow = b->row,iscol = b->col;
  PetscBool        row_identity,col_identity;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  // determine which version of MatSolve needs to be used.
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
  else                              B->ops->solve = MatSolve_SeqAIJCUSPARSE;
  if (!row_identity) {ierr = PetscInfo(B,"Row permutations exist!\n");CHKERRQ(ierr);}
  if (!col_identity) {ierr = PetscInfo(B,"Col permutations exist!\n");CHKERRQ(ierr);}

  // get the triangular factors
  ierr = MatCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
362*9ae82921SPaul Mullowney 
363*9ae82921SPaul Mullowney 
364*9ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   GPU triangular solve xx = U \ (L \ bb) for a factored matrix whose rows
   and/or columns are permuted: the right-hand side is reordered into the
   factored ordering first, and the solution is reordered back at the end.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor = (GPU_Matrix_Ifc*)factors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor = (GPU_Matrix_Ifc*)factors->upTriFactorPtr;
  CUSPARRAY                    *work = (CUSPARRAY*)factors->tempvec;
  CUSPARRAY                    *xGPU,*bGPU;

  PetscFunctionBegin;
  /* fetch the device-side arrays of the PETSc vectors */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* permute rhs in, forward solve with L, back solve with U, permute solution out */
  ierr = loFactor->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = loFactor->solve(xGPU, work);CHKERRCUSP(stat);
  stat = upFactor->solve(work, xGPU);CHKERRCUSP(stat);
  ierr = upFactor->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*aij->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
395*9ae82921SPaul Mullowney 
396*9ae82921SPaul Mullowney 
397*9ae82921SPaul Mullowney 
398*9ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   GPU triangular solve xx = U \ (L \ bb) for a factored matrix in natural
   ordering (identity row and column permutations), so no reordering of the
   right-hand side or the solution is required.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor = (GPU_Matrix_Ifc*)factors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor = (GPU_Matrix_Ifc*)factors->upTriFactorPtr;
  CUSPARRAY                    *work = (CUSPARRAY*)factors->tempvec;
  CUSPARRAY                    *xGPU,*bGPU;

  PetscFunctionBegin;
  /* fetch the device-side arrays of the PETSc vectors */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* forward solve with L, then back solve with U */
  stat = loFactor->solve(bGPU, work);CHKERRCUSP(stat);
  stat = upFactor->solve(work, xGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*aij->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
427*9ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSECopyToGPU"
/*
   Copies a (non-factored) seqaij matrix to the GPU in the storage format
   recorded in Mat_SeqAIJCUSPARSE->format, always using a compressed-row
   layout (only rows with at least one nonzero are represented).  Any existing
   GPU copy is deleted and rebuilt.  No-op unless valid_GPU_matrix is
   UNALLOCATED or CPU; on success the flag is set to PETSC_CUSP_BOTH.
*/
PetscErrorCode MatCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ      *a          = (Mat_SeqAIJ*)A->data;
  PetscInt        m           = A->rmap->n,*ii,*ridx;
  PetscErrorCode  ierr;


  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat){
      try {
	delete cusparseMat->mat;
	if (cusparseMat->tempvec)
	  delete cusparseMat->tempvec;

      } catch(char* ex) {
	SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      /* count rows that contain at least one nonzero */
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++)
	cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
	/* the CPU matrix already stores a compressed-row view; reuse it */
	m    = a->compressedrow.nrows;
	ii   = a->compressedrow.i;
	ridx = a->compressedrow.rindex;
      } else {
	// Forcing compressed row on the GPU ... only relevant for CSR storage
	/* build row pointers (ii) and original-row indices (ridx) over the nonzero rows only */
	int k=0;
	ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
	ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
	ii[0]=0;
	for (int j = 0; j<m; j++) {
	  if ((a->i[j+1]-a->i[j])>0) {
	    ii[k] = a->i[j];
	    ridx[k]= j;
	    k++;
	  }
	}
	ii[cusparseMat->nonzerorow] = a->nz;
	m = cusparseMat->nonzerorow;
      }

      // Build our matrix ... first determine the GPU storage type
      cusparseMat->mat = GPU_Matrix_Factory::getNew(cusparseMat->format);

      // FILL MODE UPPER is irrelevant
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      // Create the streams and events (if desired).
      // NOTE(review): uses PETSC_COMM_WORLD rather than the matrix's communicator -- verify intent
      PetscMPIInt    size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      // lastly, build the matrix
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
	//      }
      if (!a->compressedrow.use) {
	// free data (only the arrays we allocated above; the compressedrow arrays belong to the CPU matrix)
	ierr = PetscFree(ii);CHKERRQ(ierr);
	ierr = PetscFree(ridx);CHKERRQ(ierr);
      }

      /* work vector sized to the number of represented (nonzero) rows */
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    A->valid_GPU_matrix = PETSC_CUSP_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
515*9ae82921SPaul Mullowney 
516*9ae82921SPaul Mullowney // #undef __FUNCT__
517*9ae82921SPaul Mullowney // #define __FUNCT__ "MatCUSPARSECopyFromGPU"
518*9ae82921SPaul Mullowney // PetscErrorCode MatCUSPARSECopyFromGPU(Mat A, CUSPMATRIX *Agpu)
519*9ae82921SPaul Mullowney // {
520*9ae82921SPaul Mullowney //   Mat_SeqAIJCUSPARSE *cuspstruct = (Mat_SeqAIJCUSPARSE *) A->spptr;
521*9ae82921SPaul Mullowney //   Mat_SeqAIJ     *a          = (Mat_SeqAIJ *) A->data;
522*9ae82921SPaul Mullowney //   PetscInt        m          = A->rmap->n;
523*9ae82921SPaul Mullowney //   PetscErrorCode  ierr;
524*9ae82921SPaul Mullowney 
525*9ae82921SPaul Mullowney //   PetscFunctionBegin;
526*9ae82921SPaul Mullowney //   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
527*9ae82921SPaul Mullowney //     if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
528*9ae82921SPaul Mullowney //       try {
529*9ae82921SPaul Mullowney //         cuspstruct->mat = Agpu;
530*9ae82921SPaul Mullowney //         if (a->compressedrow.use) {
531*9ae82921SPaul Mullowney //           //PetscInt *ii, *ridx;
532*9ae82921SPaul Mullowney //           SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Cannot handle row compression for GPU matrices");
533*9ae82921SPaul Mullowney //         } else {
534*9ae82921SPaul Mullowney //           PetscInt i;
535*9ae82921SPaul Mullowney 
536*9ae82921SPaul Mullowney //           if (m+1 != (PetscInt) cuspstruct->mat->row_offsets.size()) {SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", cuspstruct->mat->row_offsets.size()-1, m);}
537*9ae82921SPaul Mullowney //           a->nz    = cuspstruct->mat->values.size();
538*9ae82921SPaul Mullowney //           a->maxnz = a->nz; /* Since we allocate exactly the right amount */
539*9ae82921SPaul Mullowney //           A->preallocated = PETSC_TRUE;
540*9ae82921SPaul Mullowney //           // Copy ai, aj, aa
541*9ae82921SPaul Mullowney //           if (a->singlemalloc) {
542*9ae82921SPaul Mullowney //             if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
543*9ae82921SPaul Mullowney //           } else {
544*9ae82921SPaul Mullowney //             if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
545*9ae82921SPaul Mullowney //             if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
546*9ae82921SPaul Mullowney //             if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
547*9ae82921SPaul Mullowney //           }
548*9ae82921SPaul Mullowney //           ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr);
549*9ae82921SPaul Mullowney //           ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
550*9ae82921SPaul Mullowney //           a->singlemalloc = PETSC_TRUE;
551*9ae82921SPaul Mullowney //           thrust::copy(cuspstruct->mat->row_offsets.begin(), cuspstruct->mat->row_offsets.end(), a->i);
552*9ae82921SPaul Mullowney //           thrust::copy(cuspstruct->mat->column_indices.begin(), cuspstruct->mat->column_indices.end(), a->j);
553*9ae82921SPaul Mullowney //           thrust::copy(cuspstruct->mat->values.begin(), cuspstruct->mat->values.end(), a->a);
554*9ae82921SPaul Mullowney //           // Setup row lengths
555*9ae82921SPaul Mullowney //           if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
556*9ae82921SPaul Mullowney //           ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr);
557*9ae82921SPaul Mullowney //           ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
558*9ae82921SPaul Mullowney //           for(i = 0; i < m; ++i) {
559*9ae82921SPaul Mullowney //             a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i];
560*9ae82921SPaul Mullowney //           }
561*9ae82921SPaul Mullowney //           // a->diag?
562*9ae82921SPaul Mullowney //         }
563*9ae82921SPaul Mullowney //         cuspstruct->tempvec = new CUSPARRAY;
564*9ae82921SPaul Mullowney //         cuspstruct->tempvec->resize(m);
565*9ae82921SPaul Mullowney //       } catch(char *ex) {
566*9ae82921SPaul Mullowney //         SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSP error: %s", ex);
567*9ae82921SPaul Mullowney //       }
568*9ae82921SPaul Mullowney //     }
569*9ae82921SPaul Mullowney //     // This assembly prevents resetting the flag to PETSC_CUSP_CPU and recopying
570*9ae82921SPaul Mullowney //     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
571*9ae82921SPaul Mullowney //     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
572*9ae82921SPaul Mullowney //     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
573*9ae82921SPaul Mullowney //   } else {
574*9ae82921SPaul Mullowney //     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Only valid for unallocated GPU matrices");
575*9ae82921SPaul Mullowney //   }
576*9ae82921SPaul Mullowney //   PetscFunctionReturn(0);
577*9ae82921SPaul Mullowney // }
578*9ae82921SPaul Mullowney 
579*9ae82921SPaul Mullowney #undef __FUNCT__
580*9ae82921SPaul Mullowney #define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
581*9ae82921SPaul Mullowney PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
582*9ae82921SPaul Mullowney {
583*9ae82921SPaul Mullowney   PetscErrorCode ierr;
584*9ae82921SPaul Mullowney 
585*9ae82921SPaul Mullowney   PetscFunctionBegin;
586*9ae82921SPaul Mullowney 
587*9ae82921SPaul Mullowney   if (right) {
588*9ae82921SPaul Mullowney     ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
589*9ae82921SPaul Mullowney     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
590*9ae82921SPaul Mullowney     ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
591*9ae82921SPaul Mullowney     ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
592*9ae82921SPaul Mullowney     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
593*9ae82921SPaul Mullowney   }
594*9ae82921SPaul Mullowney   if (left) {
595*9ae82921SPaul Mullowney     ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
596*9ae82921SPaul Mullowney     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
597*9ae82921SPaul Mullowney     ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
598*9ae82921SPaul Mullowney     ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
599*9ae82921SPaul Mullowney     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
600*9ae82921SPaul Mullowney   }
601*9ae82921SPaul Mullowney   PetscFunctionReturn(0);
602*9ae82921SPaul Mullowney }
603*9ae82921SPaul Mullowney 
604*9ae82921SPaul Mullowney #undef __FUNCT__
605*9ae82921SPaul Mullowney #define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
606*9ae82921SPaul Mullowney PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
607*9ae82921SPaul Mullowney {
608*9ae82921SPaul Mullowney   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
609*9ae82921SPaul Mullowney   PetscErrorCode ierr;
610*9ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
611*9ae82921SPaul Mullowney   CUSPARRAY      *xarray,*yarray;
612*9ae82921SPaul Mullowney 
613*9ae82921SPaul Mullowney   PetscFunctionBegin;
614*9ae82921SPaul Mullowney   // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
615*9ae82921SPaul Mullowney   // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
616*9ae82921SPaul Mullowney   ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
617*9ae82921SPaul Mullowney   ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
618*9ae82921SPaul Mullowney   try {
619*9ae82921SPaul Mullowney     ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr);
620*9ae82921SPaul Mullowney   } catch (char* ex) {
621*9ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
622*9ae82921SPaul Mullowney   }
623*9ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
624*9ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
625*9ae82921SPaul Mullowney   if (!cusparseMat->mat->hasNonZeroStream())
626*9ae82921SPaul Mullowney     ierr = WaitForGPU();CHKERRCUSP(ierr);
627*9ae82921SPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
628*9ae82921SPaul Mullowney   PetscFunctionReturn(0);
629*9ae82921SPaul Mullowney }
630*9ae82921SPaul Mullowney 
631*9ae82921SPaul Mullowney 
632*9ae82921SPaul Mullowney #undef __FUNCT__
633*9ae82921SPaul Mullowney #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
634*9ae82921SPaul Mullowney PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
635*9ae82921SPaul Mullowney {
636*9ae82921SPaul Mullowney   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
637*9ae82921SPaul Mullowney   PetscErrorCode ierr;
638*9ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
639*9ae82921SPaul Mullowney   CUSPARRAY      *xarray,*yarray,*zarray;
640*9ae82921SPaul Mullowney   PetscFunctionBegin;
641*9ae82921SPaul Mullowney   // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
642*9ae82921SPaul Mullowney   // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
643*9ae82921SPaul Mullowney   try {
644*9ae82921SPaul Mullowney     ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
645*9ae82921SPaul Mullowney     ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
646*9ae82921SPaul Mullowney     ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
647*9ae82921SPaul Mullowney     ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
648*9ae82921SPaul Mullowney 
649*9ae82921SPaul Mullowney     // multiply add
650*9ae82921SPaul Mullowney     ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);
651*9ae82921SPaul Mullowney 
652*9ae82921SPaul Mullowney     ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
653*9ae82921SPaul Mullowney     ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
654*9ae82921SPaul Mullowney     ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
655*9ae82921SPaul Mullowney 
656*9ae82921SPaul Mullowney   } catch(char* ex) {
657*9ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
658*9ae82921SPaul Mullowney   }
659*9ae82921SPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
660*9ae82921SPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
661*9ae82921SPaul Mullowney   PetscFunctionReturn(0);
662*9ae82921SPaul Mullowney }
663*9ae82921SPaul Mullowney 
664*9ae82921SPaul Mullowney #undef __FUNCT__
665*9ae82921SPaul Mullowney #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
666*9ae82921SPaul Mullowney PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
667*9ae82921SPaul Mullowney {
668*9ae82921SPaul Mullowney   PetscErrorCode  ierr;
669*9ae82921SPaul Mullowney   PetscFunctionBegin;
670*9ae82921SPaul Mullowney   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
671*9ae82921SPaul Mullowney   ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
672*9ae82921SPaul Mullowney   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
673*9ae82921SPaul Mullowney   // this is not necessary since MatCUSPARSECopyToGPU has been called.
674*9ae82921SPaul Mullowney   //if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
675*9ae82921SPaul Mullowney   //  A->valid_GPU_matrix = PETSC_CUSP_CPU;
676*9ae82921SPaul Mullowney   //}
677*9ae82921SPaul Mullowney   A->ops->mult    = MatMult_SeqAIJCUSPARSE;
678*9ae82921SPaul Mullowney   A->ops->multadd    = MatMultAdd_SeqAIJCUSPARSE;
679*9ae82921SPaul Mullowney   PetscFunctionReturn(0);
680*9ae82921SPaul Mullowney }
681*9ae82921SPaul Mullowney 
682*9ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
683*9ae82921SPaul Mullowney #undef __FUNCT__
684*9ae82921SPaul Mullowney #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
685*9ae82921SPaul Mullowney /*@C
686*9ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
687*9ae82921SPaul Mullowney    (the default parallel PETSc format).  For good matrix assembly performance
688*9ae82921SPaul Mullowney    the user should preallocate the matrix storage by setting the parameter nz
689*9ae82921SPaul Mullowney    (or the array nnz).  By setting these parameters accurately, performance
690*9ae82921SPaul Mullowney    during matrix assembly can be increased by more than a factor of 50.
691*9ae82921SPaul Mullowney 
692*9ae82921SPaul Mullowney    Collective on MPI_Comm
693*9ae82921SPaul Mullowney 
694*9ae82921SPaul Mullowney    Input Parameters:
695*9ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
696*9ae82921SPaul Mullowney .  m - number of rows
697*9ae82921SPaul Mullowney .  n - number of columns
698*9ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
699*9ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
700*9ae82921SPaul Mullowney          (possibly different for each row) or PETSC_NULL
701*9ae82921SPaul Mullowney 
702*9ae82921SPaul Mullowney    Output Parameter:
703*9ae82921SPaul Mullowney .  A - the matrix
704*9ae82921SPaul Mullowney 
705*9ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
707*9ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
708*9ae82921SPaul Mullowney 
709*9ae82921SPaul Mullowney    Notes:
710*9ae82921SPaul Mullowney    If nnz is given then nz is ignored
711*9ae82921SPaul Mullowney 
712*9ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
713*9ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
714*9ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
715*9ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
716*9ae82921SPaul Mullowney 
717*9ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
718*9ae82921SPaul Mullowney    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
719*9ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
720*9ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
721*9ae82921SPaul Mullowney 
722*9ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
723*9ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
724*9ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
725*9ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
726*9ae82921SPaul Mullowney 
727*9ae82921SPaul Mullowney    Level: intermediate
728*9ae82921SPaul Mullowney 
729*9ae82921SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
730*9ae82921SPaul Mullowney 
731*9ae82921SPaul Mullowney @*/
/* Create, type, and preallocate a sequential AIJ matrix backed by CUSPARSE.
   The sizes are set with local == global (sequential matrix); the nnz array
   is cast away from const because the preallocation routine takes PetscInt*. */
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
743*9ae82921SPaul Mullowney 
744*9ae82921SPaul Mullowney #undef __FUNCT__
745*9ae82921SPaul Mullowney #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   MatDestroy_SeqAIJCUSPARSE - Frees the GPU-side data hanging off A->spptr
   (either the regular CUSPARSE matrix or, for factored matrices, the two
   triangular factors), then delegates to MatDestroy_SeqAIJ for the CPU side.
*/
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode        ierr;
  Mat_SeqAIJCUSPARSE      *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    // The regular matrices
    try {
      /* Only delete the GPU matrix if one was ever allocated. */
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
	delete (GPU_Matrix_Ifc *)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0)
	delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    // The triangular factors
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      // destroy the matrix and the container
      delete (GPU_Matrix_Ifc *)cusparseMatLo;
      delete (GPU_Matrix_Ifc *)cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /* NOTE(review): MAT_cusparseHandle is created once and shared by ALL
     aijcusparse matrices (see MatCreate_SeqAIJCUSPARSE), yet it is destroyed
     here whenever ANY one matrix is destroyed. If other aijcusparse matrices
     are still alive their later cusparse calls would use a destroyed handle —
     confirm whether the handle should instead be torn down at PetscFinalize. */
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);
    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
791*9ae82921SPaul Mullowney 
792*9ae82921SPaul Mullowney 
793*9ae82921SPaul Mullowney EXTERN_C_BEGIN
794*9ae82921SPaul Mullowney #undef __FUNCT__
795*9ae82921SPaul Mullowney #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
796*9ae82921SPaul Mullowney PetscErrorCode  MatCreate_SeqAIJCUSPARSE(Mat B)
797*9ae82921SPaul Mullowney {
798*9ae82921SPaul Mullowney   PetscErrorCode ierr;
799*9ae82921SPaul Mullowney 
800*9ae82921SPaul Mullowney   PetscFunctionBegin;
801*9ae82921SPaul Mullowney   ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
802*9ae82921SPaul Mullowney   B->ops->mult    = MatMult_SeqAIJCUSPARSE;
803*9ae82921SPaul Mullowney   B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
804*9ae82921SPaul Mullowney 
805*9ae82921SPaul Mullowney   if (B->factortype==MAT_FACTOR_NONE) {
806*9ae82921SPaul Mullowney     /* you cannot check the inode.use flag here since the matrix was just created.*/
807*9ae82921SPaul Mullowney     B->spptr        = new Mat_SeqAIJCUSPARSE;
808*9ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSE *)B->spptr)->mat = 0;
809*9ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSE *)B->spptr)->tempvec = 0;
810*9ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSE *)B->spptr)->format = CSR;
811*9ae82921SPaul Mullowney   } else {
812*9ae82921SPaul Mullowney     /* NEXT, set the pointers to the triangular factors */
813*9ae82921SPaul Mullowney     B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
814*9ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->loTriFactorPtr = 0;
815*9ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->upTriFactorPtr = 0;
816*9ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->tempvec = 0;
817*9ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->format = CSR;
818*9ae82921SPaul Mullowney   }
819*9ae82921SPaul Mullowney   // Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...)
820*9ae82921SPaul Mullowney   if (!MAT_cusparseHandle) {
821*9ae82921SPaul Mullowney     cusparseStatus_t stat;
822*9ae82921SPaul Mullowney     stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
823*9ae82921SPaul Mullowney   }
824*9ae82921SPaul Mullowney   // Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
825*9ae82921SPaul Mullowney   // default cusparse tri solve. Note the difference with the implementation in
826*9ae82921SPaul Mullowney   // MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu
827*9ae82921SPaul Mullowney   ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);
828*9ae82921SPaul Mullowney   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
829*9ae82921SPaul Mullowney   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
830*9ae82921SPaul Mullowney   B->ops->getvecs        = MatGetVecs_SeqAIJCUSPARSE;
831*9ae82921SPaul Mullowney   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
832*9ae82921SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
833*9ae82921SPaul Mullowney   B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
834*9ae82921SPaul Mullowney   PetscFunctionReturn(0);
835*9ae82921SPaul Mullowney }
836*9ae82921SPaul Mullowney EXTERN_C_END
837*9ae82921SPaul Mullowney 
838