xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 2205254efee3a00a594e5e2a3a70f74dcb40bc03)
1 /*
2     Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format.
4 */
5 
6 #include "petscconf.h"
7 PETSC_CUDA_EXTERN_C_BEGIN
8 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
9 //#include "petscbt.h"
10 #include "../src/vec/vec/impls/dvecimpl.h"
11 #include "petsc-private/vecimpl.h"
12 PETSC_CUDA_EXTERN_C_END
13 #undef VecType
14 #include "cusparsematimpl.h"
15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 /* this is such a hack ... but I don't know of another way to pass this variable
18    from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
19    SpMV. Essentially, I need to use the same stream variable in two different
20    data structures. I do this by creating a single instance of that stream
21    and reuse it. */
22 cudaStream_t theBodyStream=0;
23 
24 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
26 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
27 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
30 PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat);
31 PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
32 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
33 PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
34 PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
35 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/*
   Query routine composed onto CUSPARSE factor matrices (see
   MatGetFactor_seqaij_cusparse() below): reports that the solver package
   backing the factorization is MATSOLVERCUSPARSE. The Mat argument is unused.
*/
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
44 
45 EXTERN_C_BEGIN
46 extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
47 EXTERN_C_END
48 /*
49   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices
50   on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp
51 
52    Level: beginner
53 */
54 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Factor-creation routine registered for MATSOLVERCUSPARSE: creates the factor
   matrix through the standard seqaij factory, retypes it as MATSEQAIJCUSPARSE,
   and installs the CUSPARSE symbolic factorization entry points.

   Only MAT_FACTOR_LU, MAT_FACTOR_ILU, and MAT_FACTOR_ILUDT are supported; any
   other factor type raises PETSC_ERR_SUP.
*/
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* let the plain seqaij factory build the factor matrix shell first */
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype != MAT_FACTOR_LU && ftype != MAT_FACTOR_ILU && ftype != MAT_FACTOR_ILUDT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
  /* expose the solver-package query on the factored matrix */
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);

  (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
  (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  (*B)->factortype             = ftype;
  PetscFunctionReturn(0);
}
EXTERN_C_END
76 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
   Type-specific implementation behind MatCUSPARSESetFormat(): records the
   requested GPU storage format for the SpMV path (stored on this matrix), the
   triangular-solve path (a file-scope setting), or both.
*/
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT) {
    cusparseMat->format = format;
  } else if (op == MAT_CUSPARSE_SOLVE) {
    cusparseMatSolveStorageFormat = format;
  } else if (op == MAT_CUSPARSE_ALL) {
    cusparseMat->format           = format;
    cusparseMatSolveStorageFormat = format;
  } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  PetscFunctionReturn(0);
}
EXTERN_C_END
102 
103 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
   to build/install PETSc to use this package.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the implementation composed on A; silently a no-op if A is not a CUSPARSE matrix */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
134 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes the -mat_cusparse_* options controlling the GPU storage format:
   the SpMV format for unfactored matrices, the TriSolve format for factored
   ones, and -mat_cusparse_storage_format which sets both.

   Fixes versus the previous revision:
   - PetscObjectOptionsBegin() is now called before PetscOptionsHead(), since
     the head/tail pair operates on the options context that Begin opens,
     and its return code is now checked.
   - A matching PetscOptionsTail() is issued before PetscOptionsEnd(), per the
     PETSc Head/Tail pairing convention.
*/
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  /* applies to both SpMV and TriSolve regardless of factor state */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
168 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic ILU factorization: defers entirely to the CPU seqaij
   implementation, then redirects the numeric stage to the CUSPARSE version so
   the triangular factors get copied to the GPU after numeric factorization.
*/
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);

  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
181 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic LU factorization: defers entirely to the CPU seqaij
   implementation, then redirects the numeric stage to the CUSPARSE version so
   the triangular factors get copied to the GPU after numeric factorization.
*/
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);

  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
194 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildLowerTriMatrix"
/*
   Assembles the unit-lower-triangular factor L in CSR form in pinned
   (page-locked) host memory, uploads it into a newly created GPU matrix
   object, and runs the cusparse triangular-solve analysis. Only runs when the
   GPU copy is stale (PETSC_CUSP_UNALLOCATED or PETSC_CUSP_CPU).

   Layout assumption: in the factored SeqAIJ storage, ai[i]..ai[i+1] brackets
   the strictly-lower part of row i (so row 0 contributes nothing below the
   diagonal, and nzLower = n unit diagonals + entries of rows 1..n-1) — TODO
   confirm against the SeqAIJ numeric factorization routines.
*/
PetscErrorCode MatSeqAIJCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               * cusparseMat       = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *ai = a->i,*aj = a->j,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscErrorCode               ierr;
  PetscInt                     *AiLo, *AjLo;
  PetscScalar                  *AALo;
  PetscInt                     i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix (pinned host memory) */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix; row 0 holds only the synthesized unit diagonal */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }
      /* NOTE(review): this overwrites the loTriFactorPtr value loaded above without
         deleting it — confirm there is no leak if this runs after a refactorization */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;

      /* host staging buffers are no longer needed once the matrix is on the GPU */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
265 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildUpperTriMatrix"
/*
   Assembles the upper-triangular factor U in CSR form in pinned host memory,
   uploads it into a newly created GPU matrix object, and runs the cusparse
   triangular-solve analysis. The diagonal entry stored for each row is the
   reciprocal of the factored value (AAUp[offset] = 1./v[nz]). Only runs when
   the GPU copy is stale (PETSC_CUSP_UNALLOCATED or PETSC_CUSP_CPU).

   Layout assumption: rows are walked in reverse using a->diag, with
   adiag[i+1]+1..adiag[i] holding row i's off-diagonal upper entries and the
   diagonal value at v[nz] — TODO confirm against the SeqAIJ factored storage.
*/
PetscErrorCode MatSeqAIJCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               * cusparseMat       = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscInt                     *AiUp, *AjUp;
  PetscScalar                  *AAUp;
  PetscInt                     i,nz, nzUpper, offset;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host memory) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements; the stored value is the reciprocal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the off-diagonal entries of row i */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* NOTE(review): this overwrites the upTriFactorPtr value loaded above without
         deleting it — confirm there is no leak if this runs after a refactorization */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;

      /* host staging buffers are no longer needed once the matrix is on the GPU */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
332 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalysisAndCopyToGPU"
/*
   Stages a completed LU/ILU factorization on the GPU: builds and uploads both
   triangular factors (including the cusparse solve analysis), allocates the
   work vector shared by the two triangular solves, and registers the
   row/column permutation indices for non-identity orderings (consumed by
   reorderIn/reorderOut in MatSolve_SeqAIJCUSPARSE below).
*/
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow               = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);

  /* work vector passed between the lower and upper triangular solves */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
370 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric LU factorization: runs the CPU seqaij numeric factorization, then
   selects the GPU MatSolve variant (the natural-ordering one when both the row
   and column permutations are identities) and stages the triangular factors on
   the GPU.
*/
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  B->ops->solve = (row_identity && col_identity) ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;

  /* get the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
392 
393 
394 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   GPU triangular solve for a reordered factorization: permutes b into x
   (reorderIn), solves the lower then the upper triangular system through the
   tempGPU work vector, and permutes the result back (reorderOut). The
   permutation indices were registered by setOrdIndices() in
   MatSeqAIJCUSPARSEAnalysisAndCopyToGPU().
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xGPU=PETSC_NULL, *bGPU=PETSC_NULL;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    * tempGPU           = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering: permute in, L-solve, U-solve, permute out */
  ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
  ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
425 
426 
427 
428 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   GPU triangular solve for factors in natural ordering: identical to
   MatSolve_SeqAIJCUSPARSE but skips the reorderIn/reorderOut permutation
   steps, solving directly from b through tempGPU into x. Installed by
   MatLUFactorNumeric_SeqAIJCUSPARSE when both orderings are identities.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xGPU=PETSC_NULL, *bGPU=PETSC_NULL;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    * tempGPU           = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* lower solve, then upper solve through the shared work vector */
  stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
457 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/*
   Mirrors the host CSR data onto the GPU whenever the host copy is newer
   (PETSC_CUSP_UNALLOCATED or PETSC_CUSP_CPU). Any existing GPU matrix is
   deleted and rebuilt from scratch. Compressed-row indexing is always used on
   the GPU: the index arrays are taken from the host matrix when available,
   otherwise synthesized here (and freed again after upload).
*/
PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  PetscInt           m            = A->rmap->n,*ii,*ridx;
  PetscErrorCode     ierr;


  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat) {
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec) delete cusparseMat->tempvec;

      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      /* count rows with at least one nonzero (used for compressed-row sizing and flop logging) */
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU ... only relevant for CSR storage */
        int k=0;
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;

        m = cusparseMat->nonzerorow;
      }

      /* Build our matrix ... first determine the GPU storage type */
      cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);

      /* Create the streams and events (if desired).
         NOTE(review): the communicator size is taken from PETSC_COMM_WORLD rather
         than the matrix communicator — presumably deliberate, tied to the shared
         theBodyStream hack at the top of this file; worth confirming. */
      PetscMPIInt size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      /* FILL MODE UPPER is irrelevant */
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      /* lastly, build the matrix */
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      /* free the synthesized index arrays — assumes setMatrix/setCPRowIndices copied
         them to the device; TODO confirm the setters do not retain the host pointers */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
543 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/*
   Creates GPU (VECSEQCUSP) vectors conforming to this matrix: the right vector
   conforms to the columns (usable as MatMult input) and the left vector to the
   rows (usable as MatMult output). Either output pointer may be PETSC_NULL.

   Fix versus the previous revision: the right vector's block size now comes
   from the column layout (mat->cmap->bs, matching the cmap-based size and
   layout reference on the adjacent lines) instead of the row layout, which
   was wrong for rectangular matrices with differing row/column block sizes.
*/
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
567 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   y = A*x on the GPU via the stored CUSPARSE matrix. The GPU copy is assumed
   current: the upload happens in MatAssemblyEnd_SeqAIJCUSPARSE, not here.
*/
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  CUSPARRAY          *xGPU        = PETSC_NULL;
  CUSPARRAY          *yGPU        = PETSC_NULL;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* The explicit MatSeqAIJCUSPARSECopyToGPU(A) formerly issued here now lives in
     MatAssemblyEnd_SeqAIJCUSPARSE */
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
    ierr = cusparseMat->mat->multiply(xGPU, yGPU);CHKERRCUSP(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* only block here when the work went to the default stream */
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
595 
596 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   y = A^T*x on the GPU (A^H*x, the Hermitian transpose, when PETSc is built
   with complex scalars). The GPU copy is assumed current: the upload happens
   in MatAssemblyEnd_SeqAIJCUSPARSE, not here.
*/
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray      =PETSC_NULL,*yarray=PETSC_NULL;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    /* with complex scalars the Hermitian (conjugate) transpose is used */
    ierr = cusparseMat->mat->multiply(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr);
#endif
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only block here when the work went to the default stream */
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
628 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   z = A*x + y on the GPU: y is first copied into z, then the product A*x is
   accumulated into z in place by the CUSPARSE multiplyAdd kernel.
*/
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray      = PETSC_NULL,*yarray=PETSC_NULL,*zarray=PETSC_NULL;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    /* seed z with y; multiplyAdd below accumulates A*x into it */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    /* yarray is acquired but never passed to the kernel — presumably only to keep
       yy's GPU array current; TODO confirm whether this pair can be dropped */
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
661 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   z = A^T*x + y on the GPU (A^H*x + y with complex scalars): y is first copied
   into z, then the transpose product is accumulated into z in place.

   Fixes versus the previous revision:
   - __FUNCT__ was incorrectly defined as "MatMultAdd_SeqAIJCUSPARSE", which
     mislabeled error tracebacks from this routine.
   - The product was accumulated into yarray (obtained read-only from yy), so
     zz ended up as a plain copy of yy while yy itself was clobbered. It now
     targets zarray, matching the non-transpose MatMultAdd_SeqAIJCUSPARSE above.
*/
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray,*zarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    /* seed z with y; multiplyAdd below accumulates the transpose product into it */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose */
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, HERMITIAN);CHKERRCUSP(ierr);
#endif

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
698 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
   Completes assembly on the CPU, mirrors the matrix to the GPU, and (for a
   final assembly) installs the GPU multiply kernels.

   NOTE(review): the copy to the GPU is performed even for MAT_FLUSH_ASSEMBLY,
   before the early return — confirm that uploading the intermediate state is
   intended rather than deferring until final assembly.
*/
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
715 
716 /* --------------------------------------------------------------------------------*/
717 #undef __FUNCT__
718 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
719 /*@
720    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
722    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
723    assembly performance the user should preallocate the matrix storage by setting
724    the parameter nz (or the array nnz).  By setting these parameters accurately,
725    performance during matrix assembly can be increased by more than a factor of 50.
726 
727    Collective on MPI_Comm
728 
729    Input Parameters:
730 +  comm - MPI communicator, set to PETSC_COMM_SELF
731 .  m - number of rows
732 .  n - number of columns
733 .  nz - number of nonzeros per row (same for all rows)
734 -  nnz - array containing the number of nonzeros in the various rows
735          (possibly different for each row) or PETSC_NULL
736 
737    Output Parameter:
738 .  A - the matrix
739 
740    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
742    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
743 
744    Notes:
745    If nnz is given then nz is ignored
746 
747    The AIJ format (also called the Yale sparse matrix format or
748    compressed row storage), is fully compatible with standard Fortran 77
749    storage.  That is, the stored row and column indices can begin at
750    either one (as in Fortran) or zero.  See the users' manual for details.
751 
752    Specify the preallocated storage with either nz or nnz (not both).
753    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
754    allocation.  For large problems you MUST preallocate memory or you
755    will get TERRIBLE performance, see the users' manual chapter on matrices.
756 
757    By default, this format uses inodes (identical nodes) when possible, to
758    improve numerical efficiency of matrix-vector products and solves. We
759    search for consecutive rows with the same nonzero structure, thereby
760    reusing matrix information to achieve increased efficiency.
761 
762    Level: intermediate
763 
764 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
765 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Standard create/size/type sequence; setting the MATSEQAIJCUSPARSE type
     runs MatCreate_SeqAIJCUSPARSE below, which installs the GPU operations. */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global dimensions are identical */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* call the SeqAIJ preallocation routine directly; the cast only drops the
     const qualifier because the internal interface takes a non-const array */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
777 
778 #undef __FUNCT__
779 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   MatDestroy_SeqAIJCUSPARSE - frees the GPU-side data hanging off A->spptr
   (either a Mat_SeqAIJCUSPARSE or, for factored matrices, a
   Mat_SeqAIJCUSPARSETriFactors), then delegates to MatDestroy_SeqAIJ for the
   host-side storage.
*/
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    /* unfactored matrix: spptr holds the GPU matrix and a temporary vector */
    try {
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        delete (GPU_Matrix_Ifc*)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0) delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors: spptr instead holds the lower/upper factors
       produced by the LU/ILU routines plus their temporary vector */
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      delete (GPU_Matrix_Ifc*) cusparseMatLo;
      delete (GPU_Matrix_Ifc*) cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /* NOTE(review): MAT_cusparseHandle is a single library handle shared by all
     CUSPARSE matrices in the process (lazily created in
     MatCreate_SeqAIJCUSPARSE); destroying it when *this* matrix dies tears it
     down for any other matrices still alive -- confirm this is intended. */
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);

    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
823 
824 EXTERN_C_BEGIN
825 #undef __FUNCT__
826 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   MatCreate_SeqAIJCUSPARSE - type constructor: builds the host SeqAIJ matrix,
   attaches the GPU bookkeeping structure in B->spptr, and routes the matrix
   operations through the CUSPARSE implementations.
*/
PetscErrorCode  MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* build the plain CPU SeqAIJ structure first */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusparseMat = new Mat_SeqAIJCUSPARSE;
    cusparseMat->mat     = 0;
    cusparseMat->tempvec = 0;
    cusparseMat->format  = MAT_CUSPARSE_CSR;
    B->spptr = cusparseMat;
  } else {
    /* a factored matrix instead carries the pair of triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *triFactors = new Mat_SeqAIJCUSPARSETriFactors;
    triFactors->loTriFactorPtr = 0;
    triFactors->upTriFactorPtr = 0;
    triFactors->tempvec        = 0;
    triFactors->format         = cusparseMatSolveStorageFormat;
    B->spptr = triFactors;
  }
  /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  /* dispatch the matrix operations to the GPU implementations */
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* no data has been pushed to the device yet */
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
876 EXTERN_C_END
877 
878 /*M
879    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
880 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
882    CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
883    the CUSPARSE library. This type is only available when using the 'txpetscgpu' package.
884    Use --download-txpetscgpu to build/install PETSc to use different CUSPARSE library and
885    the different GPU storage formats.
886 
887    Options Database Keys:
888 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
889 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
890 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
-  -mat_cusparse_solve_storage_format csr - sets the storage format of matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
892 
893   Level: beginner
894 
895 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
896 M*/
897