xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision e057df023a5f66cc14e65b1f031598b072b0c988)
1 /*
2     Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format.
4 */
5 
6 #include "petscconf.h"
7 PETSC_CUDA_EXTERN_C_BEGIN
8 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
9 //#include "petscbt.h"
10 #include "../src/vec/vec/impls/dvecimpl.h"
11 #include "petsc-private/vecimpl.h"
12 PETSC_CUDA_EXTERN_C_END
13 #undef VecType
14 #include "cusparsematimpl.h"
15 const char * const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 /* this is such a hack ... but I don't know of another way to pass this variable
18    from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
19    SpMV. Essentially, I need to use the same stream variable in two different
20    data structures. I do this by creating a single instance of that stream
21    and reuse it. */
22 cudaStream_t theBodyStream=0;
23 
24 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
26 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *);
27 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
30 PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat);
31 PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
32 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
33 PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
34 PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
35 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Query function composed onto CUSPARSE factor matrices (see
   MatGetFactor_seqaij_cusparse below) so that MatFactorGetSolverPackage()
   reports MATSOLVERCUSPARSE for them. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
44 
45 EXTERN_C_BEGIN
46 extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
47 EXTERN_C_END
48 /*
49   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices
50   on the GPU for matrices of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp
51 
52    Level: beginner
53 */
54 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/* Creates the factor matrix for MATSOLVERCUSPARSE: starts from the standard
   CPU AIJ factor, then retypes it to SEQAIJCUSPARSE and redirects the symbolic
   factorization entry points so the triangular solves run on the GPU.
   Only LU-family factorizations (LU, ILU, ILUDT) are supported. */
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* build the host-side factor shell first; it owns the symbolic data */
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT){
    ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    /* pick up -mat_cusparse_solve_storage_format etc. for the factor */
    ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
EXTERN_C_END
75 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/* Type-specific implementation behind MatCUSPARSESetFormat(): records the
   requested GPU storage format on the matrix (for SpMV) and/or in the
   file-scope cusparseMatSolveStorageFormat (for triangular solves). */
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT) {
    cusparseMat->format = format;
  } else if (op == MAT_CUSPARSE_SOLVE) {
    cusparseMatSolveStorageFormat = format;
  } else if (op == MAT_CUSPARSE_ALL) {
    /* both the SpMV format and the solve format */
    cusparseMat->format           = format;
    cusparseMatSolveStorageFormat = format;
  } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  PetscFunctionReturn(0);
}
EXTERN_C_END
100 
101 
102 /*@
103    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
104    operation. Only the MatMult operation can use different GPU storage formats
105    for AIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
106    to build/install PETSc to use this package.
107 
108    Not Collective
109 
110    Input Parameters:
111 +  A : Matrix of type SEQAIJCUSPARSE
112 .  op : MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
113 -  format : MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)
114 
115    Output Parameter:
116 
117    Level: intermediate
118 
119 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
120 @*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: dispatches to the type-specific implementation if one is
     composed on A; silently a no-op for non-CUSPARSE matrix types */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
131 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/* Processes the runtime options that select the GPU storage format:
     -mat_cusparse_mult_storage_format   (unfactored matrices: SpMV)
     -mat_cusparse_solve_storage_format  (factored matrices: triangular solve)
   Called during MatSetFromOptions() and from MatGetFactor_seqaij_cusparse(). */
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format = MAT_CUSPARSE_CSR; /* default shown by -help; was previously read uninitialized below */
  PetscBool                flg;

  PetscFunctionBegin;
  /* open the object's options block before pushing the heading; the old code
     pushed the heading first and also dropped the error code from Begin */
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
160 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ILU: delegates entirely to the CPU SeqAIJ implementation, then
   redirects the numeric stage to the CUSPARSE version so the resulting
   triangular factors are analyzed and copied to the GPU. */
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
172 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic LU: same pattern as the ILU variant above -- CPU symbolic
   factorization, GPU numeric factorization. */
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
184 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildLowerTriMatrix"
/* Extracts the unit lower triangular factor L from the host-side factored
   SeqAIJ storage of A, assembles it in CSR form in pinned host memory,
   uploads it to the GPU, and runs the cusparse triangular-solve analysis.
   The result is stored in cusparseTriFactors->loTriFactorPtr.
   In factored SeqAIJ storage a->i/a->j/a->a hold the strictly-lower part of
   each row; the unit diagonal is implicit and is materialized here. */
PetscErrorCode MatSeqAIJCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *ai = a->i,*aj = a->j,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscErrorCode     ierr;
  PetscInt *AiLo, *AjLo;   /* CSR row pointers / column indices of L (pinned host memory) */
  PetscScalar *AALo;       /* CSR values of L (pinned host memory) */
  PetscInt i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* ai[n]-ai[1] counts the strictly-lower entries of rows 1..n-1 (row 0 has
         none); the additional n accounts for the unit diagonal */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned allocations so the upload to the GPU is fast */
      ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      /* row 0 contains only the diagonal 1 */
      AiLo[0]=(PetscInt) 0;
      AiLo[n]=nzLower;
      AjLo[0]=(PetscInt) 0;
      AALo[0]=(MatScalar) 1.0;
      v    = aa;
      vi   = aj;
      offset=1;
      rowOffset=1;
      for (i=1; i<n; i++) {
        nz  = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]=rowOffset;
        rowOffset+=nz+1;

        /* copy the strictly-lower entries of row i ... */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        /* ... then append the unit diagonal entry */
        offset+=nz;
        AjLo[offset]=(PetscInt) i;
        AALo[offset]=(MatScalar) 1.0;
        offset+=1;

        v  += nz;
        vi += nz;
      }
      /* instantiate the GPU matrix in the format chosen at runtime, upload,
         and run the cusparse solve analysis phase */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;
      /* host staging buffers are no longer needed (setMatrix presumably copied
         them to the device -- confirm in txpetscgpu) */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
252 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildUpperTriMatrix"
/* Extracts the upper triangular factor U from the host-side factored SeqAIJ
   storage of A, assembles it in CSR form in pinned host memory, uploads it to
   the GPU, and runs the cusparse triangular-solve analysis. The result is
   stored in cusparseTriFactors->upTriFactorPtr.
   In factored SeqAIJ storage a->diag points at the diagonal entry of each row
   and the U part is stored in reverse; the diagonal is stored inverted (the
   factorization keeps 1/d), hence the 1./v[nz] below. */
PetscErrorCode MatSeqAIJCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscInt *AiUp, *AjUp;   /* CSR row pointers / column indices of U (pinned host memory) */
  PetscScalar *AAUp;       /* CSR values of U (pinned host memory) */
  PetscInt i,nz, nzUpper, offset;
  PetscErrorCode     ierr;

  PetscFunctionBegin;

  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag runs backwards, so adiag[0]-adiag[n] is the total U entry count
         including diagonals */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      /* walk the rows from last to first, filling the CSR arrays back-to-front */
      for (i=n-1; i>=0; i--){
        v   = aa + adiag[i+1] + 1;
        vi  = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];   /* stored value is 1/d; invert back to d */
        AiUp[i] = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* instantiate the GPU matrix in the runtime-selected format, upload,
         and run the cusparse solve analysis phase */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;
      /* release the host staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
317 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalysisAndCopyToGPU"
/* Copies both triangular factors of A to the GPU, allocates the intermediate
   work vector used between the L and U solves, and records the row/column
   orderings that MatSolve must apply.
   Fixes vs. previous revision: the __FUNCT__ string misspelled the function
   name ("...Analysi..."), and the brace-less ifs below executed CHKERRCUSP
   unconditionally on a stale ierr. */
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS               isrow = a->row,iscol = a->icol;
  PetscBool        row_identity,col_identity;
  const PetscInt   *r,*c;
  PetscInt          n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);
  /* scratch vector holding the result of the L solve before the U solve */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /* lower triangular indices: the row permutation applied on the way in */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices: the column permutation applied on the way out */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
352 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU/ILU: performs the factorization on the CPU, selects the
   appropriate GPU MatSolve variant based on the orderings, then copies the
   triangular factors to the GPU and runs the solve analysis. */
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode   ierr;
  Mat_SeqAIJ       *b=(Mat_SeqAIJ *)B->data;
  IS               isrow = b->row,iscol = b->col;
  PetscBool        row_identity,col_identity;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  /* natural ordering on both sides lets MatSolve skip the reorder steps */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
  else                              B->ops->solve = MatSolve_SeqAIJCUSPARSE;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
374 
375 
376 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/* Solves A*xx = bb on the GPU using the cached triangular factors, applying
   the row/column orderings recorded by setOrdIndices() in
   MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(). */
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  CUSPARRAY      *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering */
  ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);  /* x <- reordered b */
  stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);   /* forward solve with L */
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);   /* back solve with U */
  ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);       /* undo ordering on result */

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  /* forward + back substitution: ~2 flops per stored nonzero minus the diagonal ops */
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
407 
408 
409 
410 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Solves A*xx = bb on the GPU for factors computed with the natural ordering
   on both rows and columns: identical to MatSolve_SeqAIJCUSPARSE but with the
   reorderIn/reorderOut permutation steps skipped. */
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode    ierr;
  CUSPARRAY         *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve */
  stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);  /* forward solve with L */
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);  /* back solve with U */

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
439 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/* Copies an unfactored AIJ matrix to the GPU in the runtime-selected storage
   format (CSR/ELL/HYB), forcing compressed-row (nonzero rows only) structure.
   A no-op unless the host copy is newer than the device copy. */
PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ      *a          = (Mat_SeqAIJ*)A->data;
  PetscInt        m           = A->rmap->n,*ii,*ridx;
  PetscErrorCode  ierr;


  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat){
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec)
          delete cusparseMat->tempvec;

      } catch(char* ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      /* count rows with at least one nonzero; used below for the forced
         compressed-row structure and later for flop logging */
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++)
        cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        /* the host matrix already stores compressed-row metadata */
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU ... only relevant for CSR storage */
        int k=0;
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k] = a->i[j];     /* row pointer of the k-th nonempty row */
            ridx[k]= j;          /* its original row index */
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;
        m = cusparseMat->nonzerorow;
      }

      /* Build our matrix ... first determine the GPU storage type */
      cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);

      /* Create the streams and events (if desired).  */
      /* theBodyStream is the file-scope stream shared with the parallel SpMV */
      PetscMPIInt    size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      /* FILL MODE UPPER is irrelevant */
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      /* lastly, build the matrix */
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      /* NOTE(review): assumes setMatrix/setCPRowIndices copy ii/ridx to the
         device, since they are freed immediately below -- confirm in txpetscgpu */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* work vector sized to the compressed row count */
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    A->valid_GPU_matrix = PETSC_CUSP_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
524 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/* Creates compatible right (x in A*x) and/or left (y in A'*y) work vectors of
   type VECSEQCUSP so products stay on the GPU. Either output may be NULL. */
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;

  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* NOTE(review): the right vector uses rmap->bs; cmap->bs may be intended -- confirm */
    ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
549 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/* yy = A*xx, computed on the GPU via the cached cusparse matrix. */
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray,*yarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
    ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr);
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* when the SpMV ran on a non-default stream, skip the blocking sync so work
     can overlap (the stream ordering guarantees correctness for later users) */
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
577 
578 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/* yy = A^T*xx (A^H*xx for complex builds), computed on the GPU. */
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray,*yarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    /* complex scalars: MatMultTranspose in PETSc is the Hermitian transpose */
    ierr = cusparseMat->mat->multiply(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr);
#endif
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
610 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/* zz = yy + A*xx, computed on the GPU: zz is seeded with a copy of yy and the
   product is accumulated into it by multiplyAdd(). */
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray,*yarray,*zarray;
  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    /* yarray is acquired but not passed to multiplyAdd: zz already holds the
       copy of yy, so the accumulation target is zarray */
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
642 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/* zz = yy + A^T*xx (A^H*xx for complex builds), computed on the GPU.
   Fixes vs. previous revision: __FUNCT__ said "MatMultAdd_SeqAIJCUSPARSE"
   (copy-paste), and multiplyAdd accumulated into yarray -- overwriting the
   read-only yy and leaving zz as a plain copy of yy. The accumulation target
   is zarray, matching MatMultAdd_SeqAIJCUSPARSE above. */
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray,*yarray,*zarray;
  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    /* seed zz with yy, then accumulate the transpose product into it */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose */
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, HERMITIAN);CHKERRCUSP(ierr);
#endif

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
678 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/* Completes assembly via the CPU SeqAIJ routine, pushes the data to the GPU,
   and (on a final assembly only) installs the GPU mult/solve operations. */
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* final assembly: route multiplication through the GPU implementations */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
694 
695 /* --------------------------------------------------------------------------------*/
696 #undef __FUNCT__
697 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
698 /*@
699    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
700    (the default parallel PETSc format). This matrix will ultimately be pushed down
701    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
702    assembly performance the user should preallocate the matrix storage by setting
703    the parameter nz (or the array nnz).  By setting these parameters accurately,
704    performance during matrix assembly can be increased by more than a factor of 50.
705 
706    Collective on MPI_Comm
707 
708    Input Parameters:
709 +  comm - MPI communicator, set to PETSC_COMM_SELF
710 .  m - number of rows
711 .  n - number of columns
712 .  nz - number of nonzeros per row (same for all rows)
713 -  nnz - array containing the number of nonzeros in the various rows
714          (possibly different for each row) or PETSC_NULL
715 
716    Output Parameter:
717 .  A - the matrix
718 
719    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
720    MatXXXXSetPreallocation() paradigm instead of this routine directly.
721    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
722 
723    Notes:
724    If nnz is given then nz is ignored
725 
726    The AIJ format (also called the Yale sparse matrix format or
727    compressed row storage), is fully compatible with standard Fortran 77
728    storage.  That is, the stored row and column indices can begin at
729    either one (as in Fortran) or zero.  See the users' manual for details.
730 
731    Specify the preallocated storage with either nz or nnz (not both).
732    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
733    allocation.  For large problems you MUST preallocate memory or you
734    will get TERRIBLE performance, see the users' manual chapter on matrices.
735 
736    By default, this format uses inodes (identical nodes) when possible, to
737    improve numerical efficiency of matrix-vector products and solves. We
738    search for consecutive rows with the same nonzero structure, thereby
739    reusing matrix information to achieve increased efficiency.
740 
741    Level: intermediate
742 
743 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
744 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
756 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   Destructor for MATSEQAIJCUSPARSE matrices.

   Frees the C++ GPU-side structure hanging off A->spptr (allocated with
   operator new in MatCreate_SeqAIJCUSPARSE, so it must be deleted here
   rather than PetscFree'd), releases the cusparse handle, and finally
   delegates to MatDestroy_SeqAIJ for the host-side CSR data.
*/
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode        ierr;
  Mat_SeqAIJCUSPARSE      *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    /* Unfactored matrix: spptr holds a Mat_SeqAIJCUSPARSE. */
    try {
      /* the GPU matrix object only exists once something has been copied to the GPU */
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
	delete (GPU_Matrix_Ifc *)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0)
	delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char* ex) {
      /* the GPU matrix classes signal failure by throwing a char* message */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      /* deleting a null pointer is a no-op, so unset factors are harmless here */
      delete (GPU_Matrix_Ifc *)cusparseMatLo;
      delete (GPU_Matrix_Ifc *)cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /* NOTE(review): MAT_cusparseHandle is shared by all cusparse matrices, so the
     library handle is torn down when the FIRST such matrix is destroyed;
     presumably it is lazily recreated by MatCreate_SeqAIJCUSPARSE for any
     surviving matrices -- confirm this is the intended lifetime. */
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);
    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
802 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   Type constructor registered for MATSEQAIJCUSPARSE.

   Builds the matrix as an ordinary host SeqAIJ first, then attaches the
   GPU-side bookkeeping structure in B->spptr (a Mat_SeqAIJCUSPARSE for a
   regular matrix, a Mat_SeqAIJCUSPARSETriFactors for a factored one) and
   installs the CUSPARSE implementations of the matrix operations.
*/
PetscErrorCode  MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* start from the plain CPU SeqAIJ implementation */
  ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusparseStruct = new Mat_SeqAIJCUSPARSE;
    cusparseStruct->mat     = 0;
    cusparseStruct->tempvec = 0;
    cusparseStruct->format  = MAT_CUSPARSE_CSR;
    B->spptr                = cusparseStruct;
  } else {
    /* factored matrix: spptr instead carries the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *triFactors = new Mat_SeqAIJCUSPARSETriFactors;
    triFactors->loTriFactorPtr = 0;
    triFactors->upTriFactorPtr = 0;
    triFactors->tempvec        = 0;
    triFactors->format         = cusparseMatSolveStorageFormat;
    B->spptr                   = triFactors;
  }
  /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);
  /* swap in the GPU versions of the core matrix operations */
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* nothing has been copied to the GPU yet */
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END
850 
851 /*M
852    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
853 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
   the CUSPARSE library. This type is only available when using the 'txpetscgpu' package.
   Use --download-txpetscgpu when configuring PETSc to enable the CUSPARSE library and
   the different GPU storage formats.
859 
860    Options Database Keys:
861 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
862 .  -mat_cusparse_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Only available with 'txpetscgpu' package.
863 .  -mat_cusparse_mult_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Only available with 'txpetscgpu' package.
864 -  -mat_cusparse_solve_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Only available with 'txpetscgpu' package.
865 
866   Level: beginner
867 
868 .seealso: MatCreateMPIAIJ,MATSEQAIJ,MATMPIAIJ, MATMPIAIJCUSPARSE, MATSEQAIJCUSPARSE, MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
869 M*/
870