xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 087f32624287cccaebf63b858eabc4ad5aaff02a)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format.
4 */
5 
6 #include "petscconf.h"
7 PETSC_CUDA_EXTERN_C_BEGIN
8 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 
11 #include "../src/vec/vec/impls/dvecimpl.h"
12 #include "petsc-private/vecimpl.h"
13 PETSC_CUDA_EXTERN_C_END
14 #undef VecType
15 #include "cusparsematimpl.h"
16 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
17 
18 /* this is such a hack ... but I don't know of another way to pass this variable
19    from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
20    SpMV. Essentially, I need to use the same stream variable in two different
21    data structures. I do this by creating a single instance of that stream
22    and reuse it. */
23 cudaStream_t theBodyStream=0;
24 
25 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
26 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
27 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
28 
29 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
30 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
31 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
32 
33 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
34 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
35 PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
36 PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
37 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
38 PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
39 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
40 PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
41 PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
42 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Reports the solver-package name (MATSOLVERCUSPARSE) for a factor matrix.
   Composed onto factor matrices in MatGetFactor_seqaij_cusparse() so that
   MatFactorGetSolverPackage() can query which package produced the factor. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
51 
52 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
53 
54 
55 /*MC
56   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
57   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
60   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
61   algorithms are not recommended. This class does NOT support direct solver operations.
62 
63   ./configure --download-txpetscgpu to install PETSc to use CUSPARSE
64 
65   Consult CUSPARSE documentation for more information about the matrix storage formats
66   which correspond to the options database keys below.
67 
68    Options Database Keys:
69 .  -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
70 
71   Level: beginner
72 
73 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
74 M*/
75 
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/* Creates a factor matrix of type SEQAIJCUSPARSE: obtains the plain PETSc AIJ
   factor first, retypes it to the GPU-backed class, and installs the
   CUSPARSE-aware symbolic-factorization entry points for the requested
   factor kind.  Unsupported factor types raise PETSC_ERR_SUP. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* build the standard AIJ factor, then switch its type to the GPU-backed one */
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);

  /* hook up the symbolic phase matching the requested factorization */
  if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  (*B)->factortype = ftype;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
97 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/* Type-specific implementation behind MatCUSPARSESetFormat(): records the
   requested GPU storage format for SpMV (on the per-matrix GPU struct) and/or
   for triangular solves (in the file-scope cusparseMatSolveStorageFormat). */
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT) {
    gpuMat->format = format;
  } else if (op == MAT_CUSPARSE_SOLVE) {
    cusparseMatSolveStorageFormat = format;
  } else if (op == MAT_CUSPARSE_ALL) {
    gpuMat->format                = format;
    cusparseMatSolveStorageFormat = format;
  } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  PetscFunctionReturn(0);
}
121 
122 /*@
123    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
124    operation. Only the MatMult operation can use different GPU storage formats
125    for MPIAIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
126    to build/install PETSc to use this package.
127 
128    Not Collective
129 
130    Input Parameters:
131 +  A - Matrix of type SEQAIJCUSPARSE
132 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
133 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)
134 
135    Output Parameter:
136 
137    Level: intermediate
138 
139 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
140 @*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatches to the composed "MatCUSPARSESetFormat_C" method (see
     MatCUSPARSESetFormat_SeqAIJCUSPARSE); PetscTryMethod succeeds silently
     when the method is not composed on A */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
152 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/* Processes the -mat_cusparse_* options-database keys for a SEQAIJCUSPARSE
   matrix.  Unfactored matrices expose the SpMV format key, factored matrices
   the triangular-solve format key, and both honor -mat_cusparse_storage_format
   which sets every format at once.  Fix: the return code of
   PetscObjectOptionsBegin() was previously discarded; it is now checked like
   every other call in this function. */
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  /* NOTE(review): PetscOptionsHead() is conventionally called inside the
     Begin/End pair; confirm this pre-Begin placement is intended. */
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* an unfactored matrix performs SpMV on the GPU */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    /* a factored matrix performs triangular solves on the GPU */
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  /* umbrella key: sets the storage format for all operations at once */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
186 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ILU factorization: delegates to the CPU SeqAIJ implementation,
   then points the numeric phase at the CUSPARSE variant so the subsequent
   numeric factorization installs the GPU triangular-solve machinery. */
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* installed after the delegated call returns so this assignment is final */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
198 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic LU factorization: delegates to the CPU SeqAIJ implementation,
   then points the numeric phase at the CUSPARSE variant so the subsequent
   numeric factorization installs the GPU triangular-solve machinery. */
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* installed after the delegated call returns so this assignment is final */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
210 
#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ICC factorization: delegates to the CPU SeqAIJ implementation,
   then points the numeric phase at the CUSPARSE variant so the subsequent
   numeric factorization installs the GPU triangular-solve machinery. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  /* installed after the delegated call returns so this assignment is final */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
222 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic Cholesky factorization: delegates to the CPU SeqAIJ implementation,
   then points the numeric phase at the CUSPARSE variant so the subsequent
   numeric factorization installs the GPU triangular-solve machinery. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  /* installed after the delegated call returns so this assignment is final */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
234 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/* Extracts the unit-diagonal lower triangular factor from the combined LU
   storage of the (already numerically factored) SeqAIJ matrix, uploads it to
   the GPU in the configured storage format, and runs the CUSPARSE solve
   analysis on it.  Host staging buffers are pinned (cudaMallocHost) and are
   released before returning. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               * cusparseMat       = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *ai = a->i,*aj = a->j,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscErrorCode               ierr;
  PetscInt                     *AiLo, *AjLo;
  PetscScalar                  *AALo;
  PetscInt                     i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  /* only (re)build when the GPU copy is missing or stale relative to the host */
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* (n unit-diagonal entries plus the below-diagonal entries ai[1]..ai[n];
         row 0 contributes only its unit diagonal, set explicitly below) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the below-diagonal entries of row i as stored in the factor */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        /* append the explicit unit diagonal entry at the end of row i */
        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }
      /* create the GPU matrix in the format chosen at MatSetFromOptions time */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      /* NOTE(review): any previous loTriFactorPtr is overwritten here without
         being freed; presumably the valid_GPU_matrix guard above prevents
         rebuilds -- confirm. */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;

      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
305 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/* Extracts the upper triangular factor from the combined LU storage of the
   (already numerically factored) SeqAIJ matrix -- walking rows backward via
   the a->diag pointers -- uploads it to the GPU in the configured storage
   format, and runs the CUSPARSE solve analysis on it.  Pinned host staging
   buffers are released before returning. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               * cusparseMat       = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscInt                     *AiUp, *AjUp;
  PetscScalar                  *AAUp;
  PetscInt                     i,nz, nzUpper, offset;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* only (re)build when the GPU copy is missing or stale relative to the host */
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* (adiag decreases with the row index in this storage, hence adiag[0]-adiag[n]) */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, last row first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* (v[nz] is the entry stored at a->diag[i]; its reciprocal is placed
           in the diagonal slot, followed by the off-diagonal entries) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* create the GPU matrix in the format chosen at MatSetFromOptions time */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      /* NOTE(review): any previous upTriFactorPtr is overwritten here without
         being freed; presumably the valid_GPU_matrix guard above prevents
         rebuilds -- confirm. */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;

      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
372 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/* Copies both ILU/LU triangular factors to the GPU (building them and running
   the CUSPARSE solve analysis), allocates the GPU work vector shared by the
   solve routines, and uploads the row/column permutations when they are not
   the identity (the natural-ordering solve path never reorders). */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow               = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* GPU scratch vector used between the lower and upper solves.
     NOTE(review): allocated unconditionally with new; a repeated call would
     leak the previous vector -- confirm the destroy path frees it. */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
410 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/* Builds the two GPU triangular factors for an ICC/Cholesky factorization
   from the single (upper triangular) host factor.  The upper factor uploads
   the values assembled in AAUp; the "lower" factor shares the same upper
   sparsity pattern with the rescaled values in AALo and its solve analysis is
   done with CUSPARSE_OPERATION_TRANSPOSE, which is why its fill mode is
   deliberately CUSPARSE_FILL_MODE_UPPER as well. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               * cusparseMatLo       = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               * cusparseMatUp       = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;
  PetscInt                     *AiUp, *AjUp;
  PetscScalar                  *AAUp;
  PetscScalar                  *AALo;
  PetscInt          nzUpper=a->nz,n=A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is cast both to Mat_SeqAIJ (above) and Mat_SeqSBAIJ
     (here); this relies on the i/j/a members living at compatible offsets in
     both structs -- confirm against the struct definitions. */
  Mat_SeqSBAIJ      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt    *ai=b->i,*aj=b->j,*vj;
  const MatScalar   *aa=b->a,*v;

  PetscFunctionBegin;
  /* only (re)build when the GPU copy is missing or stale relative to the host */
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* (pinned host buffers; AALo holds the rescaled values for the lower view) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* (v[nz] is the diagonal entry of row i; both views store its reciprocal) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = 1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* negate the off-diagonals for the upper view; additionally divide by
             the diagonal entry for the lower view */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* Build the upper triangular piece */
      cusparseMatUp = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
      stat = cusparseMatUp->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
      ierr = cusparseMatUp->setMatrix(A->rmap->n, A->cmap->n, a->nz, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMatUp->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMatUp;

      /* Build the lower triangular piece */
      /* (stored upper, solved as its transpose -- see the header comment) */
      cusparseMatLo = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
      stat = cusparseMatLo->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
      ierr = cusparseMatLo->setMatrix(A->rmap->n, A->cmap->n, a->nz, AiUp, AjUp, AALo);CHKERRCUSP(ierr);
      stat = cusparseMatLo->solveAnalysis(CUSPARSE_OPERATION_TRANSPOSE);CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMatLo;

      /* set this flag ... for performance logging */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->isSymmOrHerm = PETSC_TRUE;

      A->valid_GPU_matrix = PETSC_CUSP_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
494 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/* ICC/Cholesky analogue of the ILU copy routine: builds both GPU triangular
   views of the single stored factor, allocates the GPU work vector, and
   uploads the row permutation (shared by both solves) when it is not the
   identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS             ip=a->row;
  const PetscInt *rip;
  PetscBool      perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* GPU scratch vector used between the two triangular solves.
     NOTE(review): allocated unconditionally with new; a repeated call would
     leak the previous vector -- confirm the destroy path frees it. */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);
  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    /* the same row permutation applies to both factors in the symmetric case */
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(rip, n);CHKERRCUSP(ierr);
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(rip, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
521 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU factorization: the factorization itself runs on the CPU via the
   SeqAIJ kernel; afterwards the triangular factors are copied to the GPU and
   the MatSolve implementations are selected according to the orderings. */
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij     = (Mat_SeqAIJ*)B->data;
  IS             rowperm  = aij->row,colperm = aij->col;
  PetscBool      rowIdentity,colIdentity;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* natural (identity) orderings allow the faster solve path that skips the
     reorder-in/reorder-out steps */
  ierr = ISIdentity(rowperm,&rowIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(colperm,&colIdentity);CHKERRQ(ierr);
  if (rowIdentity && colIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* push the freshly computed triangular factors to the GPU */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
548 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* Numeric Cholesky/ICC factorization: the factorization runs on the CPU via
   the SeqAIJ kernel; afterwards the factor is copied to the GPU and the
   MatSolve implementations are selected according to the ordering. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij  = (Mat_SeqAIJ*)B->data;
  IS             perm  = aij->row;
  PetscBool      permIdentity;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* a natural (identity) ordering allows the faster solve path that skips the
     reorder-in/reorder-out steps */
  ierr = ISIdentity(perm,&permIdentity);CHKERRQ(ierr);
  if (permIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* push the freshly computed factor data to the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
575 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/* Prepares both triangular factors for transpose solves: initializes the
   CUSPARSE transpose descriptors and runs the transpose solve analysis.
   The fill modes are exchanged relative to the forward solve (the transpose
   of the unit-diagonal lower factor is upper triangular, and vice versa). */
PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFact  = (GPU_Matrix_Ifc*)factors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFact  = (GPU_Matrix_Ifc*)factors->upTriFactorPtr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* transpose of the lower factor: upper triangular, unit diagonal */
  stat = loFact->initializeCusparseMatTranspose(MAT_cusparseHandle,
                                                CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                                CUSPARSE_FILL_MODE_UPPER,
                                                CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
  stat = loFact->solveAnalysisTranspose();CHKERRCUSP(stat);

  /* transpose of the upper factor: lower triangular, non-unit diagonal */
  stat = upFact->initializeCusparseMatTranspose(MAT_cusparseHandle,
                                                CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                                CUSPARSE_FILL_MODE_LOWER,
                                                CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
  stat = upFact->solveAnalysisTranspose();CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
598 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/* Builds the GPU transpose representation used for MatMultTranspose.  A
   symmetric/Hermitian matrix keeps a symmetric descriptor (upper fill mode);
   otherwise a general descriptor is used. */
PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!gpuMat->isSymmOrHerm) {
    stat = gpuMat->mat->initializeCusparseMatTranspose(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
  } else {
    stat = gpuMat->mat->initializeCusparseMatTranspose(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_SYMMETRIC, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
  }
  /* hand the host CSR arrays to the transpose setup */
  ierr = gpuMat->mat->setMatrixTranspose(A->rmap->n, A->cmap->n, aij->nz, aij->i, aij->j, aij->a);CHKERRCUSP(ierr);
  PetscFunctionReturn(0);
}
616 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/* GPU transpose solve for a factorization with a non-natural ordering:
   lazily performs the transpose solve analysis, applies the permutations
   around the two triangular solves (upper-transpose first, then
   lower-transpose), and logs the flops. */
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xarray,*barray;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFact  = (GPU_Matrix_Ifc*)factors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFact  = (GPU_Matrix_Ifc*)factors->upTriFactorPtr;
  CUSPARRAY                    *work    = (CUSPARRAY*)factors->tempvec;

  PetscFunctionBegin;
  /* the transpose analysis is created on first use and cached */
  if (!factors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    factors->hasTranspose = PETSC_TRUE;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* permute b into x, solve the two transposed factors, undo the permutation */
  ierr = upFact->reorderIn(xarray, barray);CHKERRCUSP(ierr);
  stat = upFact->solveTranspose(xarray, work);CHKERRCUSP(stat);
  stat = loFact->solveTranspose(work, xarray);CHKERRCUSP(stat);
  ierr = loFact->reorderOut(xarray);CHKERRCUSP(ierr);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  /* the flop count differs for symmetric/Hermitian (ICC/Cholesky) factors */
  if (factors->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*aij->nz - 3.0*A->cmap->n);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*aij->nz - A->cmap->n);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
659 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* GPU transpose solve for a factorization computed with the natural
   ordering: no permutation of the right-hand side or solution is needed,
   so b feeds the upper-transpose solve directly and the lower-transpose
   solve writes straight into x. */
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xarray,*barray;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFact  = (GPU_Matrix_Ifc*)factors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFact  = (GPU_Matrix_Ifc*)factors->upTriFactorPtr;
  CUSPARRAY                    *work    = (CUSPARRAY*)factors->tempvec;

  PetscFunctionBegin;
  /* the transpose analysis is created on first use and cached */
  if (!factors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    factors->hasTranspose = PETSC_TRUE;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* solve the upper-transpose then lower-transpose factors, no reordering */
  stat = upFact->solveTranspose(barray, work);CHKERRCUSP(stat);
  stat = loFact->solveTranspose(work, xarray);CHKERRCUSP(stat);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  /* the flop count differs for symmetric/Hermitian (ICC/Cholesky) factors */
  if (factors->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*aij->nz - 3.0*A->cmap->n);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*aij->nz - A->cmap->n);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
699 
700 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/* Solve A x = b on the GPU using the cached triangular factors.  The factors
   were computed under a fill-reducing ordering, so the right hand side is
   permuted in (reorderIn), the two triangular solves are applied, and the
   solution is permuted back out (reorderOut). */
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xGPU, *bGPU;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    * tempGPU           = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering: permute b into x, L-solve into temp, U-solve back
     into x, then undo the permutation in place.
     NOTE(review): reorderIn is taken from the lower factor and reorderOut from
     the upper one (the transpose path uses the opposite pairing) -- presumably
     both factors carry the same permutation; verify in GPU_Matrix_Ifc. */
  ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
  ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  /* symmetric/hermitian storage holds half the entries -> different flop count */
  if (cusparseTriFactors->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
735 
736 
737 
738 
739 #undef __FUNCT__
740 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
741 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
742 {
743   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
744   PetscErrorCode               ierr;
745   CUSPARRAY                    *xGPU, *bGPU;
746   cusparseStatus_t             stat;
747   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
748   GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
749   GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
750   CUSPARRAY                    * tempGPU           = (CUSPARRAY*) cusparseTriFactors->tempvec;
751 
752   PetscFunctionBegin;
753   /* Get the GPU pointers */
754   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
755   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
756 
757   /* solve */
758   stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
759   stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
760 
761   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
762   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
763   ierr = WaitForGPU();CHKERRCUSP(ierr);
764   if (cusparseTriFactors->isSymmOrHerm) {
765     ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr);
766   } else {
767     ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
768   }
769   PetscFunctionReturn(0);
770 }
771 
772 #undef __FUNCT__
773 #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
774 PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
775 {
776 
777   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
778   Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
779   PetscInt           m            = A->rmap->n,*ii,*ridx;
780   PetscErrorCode     ierr;
781   PetscBool          symmetryTest=PETSC_FALSE, hermitianTest=PETSC_FALSE;
782   PetscBool          symmetryOptionIsSet=PETSC_FALSE, symmetryOptionTest=PETSC_FALSE;
783 
784   PetscFunctionBegin;
785   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
786     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
787     /*
788       It may be possible to reuse nonzero structure with new matrix values but
789       for simplicity and insured correctness we delete and build a new matrix on
790       the GPU. Likely a very small performance hit.
791     */
792     if (cusparseMat->mat) {
793       try {
794         delete cusparseMat->mat;
795         if (cusparseMat->tempvec) delete cusparseMat->tempvec;
796 
797       } catch(char *ex) {
798         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
799       }
800     }
801     try {
802       cusparseMat->nonzerorow=0;
803       for (int j = 0; j<m; j++) cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);
804 
805       if (a->compressedrow.use) {
806         m    = a->compressedrow.nrows;
807         ii   = a->compressedrow.i;
808         ridx = a->compressedrow.rindex;
809       } else {
810         /* Forcing compressed row on the GPU ... only relevant for CSR storage */
811         int k=0;
812         ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
813         ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
814         ii[0]=0;
815         for (int j = 0; j<m; j++) {
816           if ((a->i[j+1]-a->i[j])>0) {
817             ii[k]  = a->i[j];
818             ridx[k]= j;
819             k++;
820           }
821         }
822         ii[cusparseMat->nonzerorow] = a->nz;
823 
824         m = cusparseMat->nonzerorow;
825       }
826 
827       /* Build our matrix ... first determine the GPU storage type */
828       cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);
829 
830       /* Create the streams and events (if desired).  */
831       PetscMPIInt size;
832       ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
833       ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);
834 
835       ierr = MatIsSymmetricKnown(A,&symmetryOptionIsSet,&symmetryOptionTest);
836       if ((symmetryOptionIsSet && !symmetryOptionTest) || !symmetryOptionIsSet) {
837 	/* CUSPARSE_FILL_MODE_UPPER and CUSPARSE_DIAG_TYPE_NON_UNIT are irrelevant here */
838         cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
839         cusparseMat->isSymmOrHerm = PETSC_FALSE;
840       } else {
841         ierr = MatIsSymmetric(A,0.0,&symmetryTest);
842         if (symmetryTest) {
843           cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_SYMMETRIC, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
844           cusparseMat->isSymmOrHerm = PETSC_TRUE;
845         } else {
846           ierr = MatIsHermitian(A,0.0,&hermitianTest);
847           if (hermitianTest) {
848             cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_HERMITIAN, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
849             cusparseMat->isSymmOrHerm = PETSC_TRUE;
850           } else {
851             /* CUSPARSE_FILL_MODE_UPPER and CUSPARSE_DIAG_TYPE_NON_UNIT are irrelevant here */
852             cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
853             cusparseMat->isSymmOrHerm = PETSC_FALSE;
854           }
855         }
856       }
857 
858       /* lastly, build the matrix */
859       ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
860       cusparseMat->mat->setCPRowIndices(ridx, m);
861       if (!a->compressedrow.use) {
862         ierr = PetscFree(ii);CHKERRQ(ierr);
863         ierr = PetscFree(ridx);CHKERRQ(ierr);
864       }
865       cusparseMat->tempvec = new CUSPARRAY;
866       cusparseMat->tempvec->resize(m);
867     } catch(char *ex) {
868       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
869     }
870     ierr = WaitForGPU();CHKERRCUSP(ierr);
871 
872     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
873 
874     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
875   }
876   PetscFunctionReturn(0);
877 }
878 
879 #undef __FUNCT__
880 #define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
881 PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
882 {
883   PetscErrorCode ierr;
884 
885   PetscFunctionBegin;
886   if (right) {
887     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
888     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
889     ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
890     ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
891     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
892   }
893   if (left) {
894     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
895     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
896     ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
897     ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
898     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
899   }
900   PetscFunctionReturn(0);
901 }
902 
903 #undef __FUNCT__
904 #define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
905 PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
906 {
907   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
908   PetscErrorCode     ierr;
909   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
910   CUSPARRAY          *xarray,*yarray;
911 
912   PetscFunctionBegin;
913   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
914      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
915   ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
916   ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
917   try {
918     ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr);
919   } catch (char *ex) {
920     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
921   }
922   ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
923   ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
924   if (!cusparseMat->mat->hasNonZeroStream()) {
925     ierr = WaitForGPU();CHKERRCUSP(ierr);
926   }
927   if (cusparseMat->isSymmOrHerm) {
928     ierr = PetscLogFlops(4.0*a->nz - 3.0*cusparseMat->nonzerorow);CHKERRQ(ierr);
929   } else {
930     ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
931   }
932   PetscFunctionReturn(0);
933 }
934 
935 
936 #undef __FUNCT__
937 #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
938 PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
939 {
940   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
941   PetscErrorCode     ierr;
942   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
943   CUSPARRAY          *xarray,*yarray;
944 
945   PetscFunctionBegin;
946   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
947      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
948   if (!cusparseMat->hasTranspose) {
949     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
950     cusparseMat->hasTranspose=PETSC_TRUE;
951   }
952   ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
953   ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
954   try {
955     ierr = cusparseMat->mat->multiplyTranspose(xarray, yarray);CHKERRCUSP(ierr);
956   } catch (char *ex) {
957     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
958   }
959   ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
960   ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
961   if (!cusparseMat->mat->hasNonZeroStream()) {
962     ierr = WaitForGPU();CHKERRCUSP(ierr);
963   }
964   if (cusparseMat->isSymmOrHerm) {
965     ierr = PetscLogFlops(4.0*a->nz - 3.0*cusparseMat->nonzerorow);CHKERRQ(ierr);
966   } else {
967     ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
968   }
969   PetscFunctionReturn(0);
970 }
971 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/* z = y + A*x on the GPU: zz is first overwritten with yy, then the product
   is accumulated into it via multiplyAdd. */
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray,*zarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    /* z <- y; multiplyAdd below then accumulates A*x into z */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    /* NOTE(review): yarray is acquired/restored but never passed to the
       multiply -- yy's contribution already lives in zz via the VecCopy;
       the get/restore pair looks redundant, confirm before removing. */
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  /* symmetric/hermitian storage holds half the entries -> different flop count */
  if (cusparseMat->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*a->nz - 2.0*cusparseMat->nonzerorow);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1008 
1009 #undef __FUNCT__
1010 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
1011 PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1012 {
1013   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1014   PetscErrorCode     ierr;
1015   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
1016   CUSPARRAY          *xarray,*yarray,*zarray;
1017 
1018   PetscFunctionBegin;
1019   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1020      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
1021   if (!cusparseMat->hasTranspose) {
1022     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1023     cusparseMat->hasTranspose=PETSC_TRUE;
1024   }
1025   try {
1026     ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
1027     ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1028     ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
1029     ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1030 
1031     /* multiply add with matrix transpose */
1032     ierr = cusparseMat->mat->multiplyAddTranspose(xarray, yarray);CHKERRCUSP(ierr);
1033 
1034     ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1035     ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
1036     ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1037 
1038   } catch(char *ex) {
1039     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1040   }
1041   ierr = WaitForGPU();CHKERRCUSP(ierr);
1042   if (cusparseMat->isSymmOrHerm) {
1043     ierr = PetscLogFlops(4.0*a->nz - 2.0*cusparseMat->nonzerorow);CHKERRQ(ierr);
1044   } else {
1045     ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1046   }
1047   PetscFunctionReturn(0);
1048 }
1049 
1050 #undef __FUNCT__
1051 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
1052 PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1053 {
1054   PetscErrorCode ierr;
1055 
1056   PetscFunctionBegin;
1057   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
1058   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1059   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1060   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1061   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1062   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1063   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1064   PetscFunctionReturn(0);
1065 }
1066 
1067 /* --------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1128 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/* Destroy the GPU-side data attached to an AIJ-CUSPARSE matrix, then fall
   through to the standard SeqAIJ destructor for the host-side data.
   An unfactored matrix carries a Mat_SeqAIJCUSPARSE in A->spptr; a factored
   matrix carries a Mat_SeqAIJCUSPARSETriFactors holding the two triangular
   factors and the solve scratch vector. */
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    try {
      /* the device matrix only exists once something was copied to the GPU */
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        delete (GPU_Matrix_Ifc*)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0) delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      delete (GPU_Matrix_Ifc*) cusparseMatLo;
      delete (GPU_Matrix_Ifc*) cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /* NOTE(review): MAT_cusparseHandle is shared by every AIJ-CUSPARSE matrix
     (created once in MatCreate_SeqAIJCUSPARSE) yet it is torn down here as
     soon as *any* one matrix is destroyed -- confirm this is safe when
     several such matrices are alive at the same time. */
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);

    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1174 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/* Type constructor for MATSEQAIJCUSPARSE: builds a SeqAIJ matrix, attaches
   the GPU-side data structure in B->spptr (either the mat-vec structure or,
   for a factored matrix, the triangular-factor structure), creates the shared
   cusparse handle on first use, and installs the GPU operation table. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;

    /* device matrix and scratch vector are allocated later, in MatSeqAIJCUSPARSECopyToGPU */
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->tempvec      = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->hasTranspose = PETSC_FALSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->isSymmOrHerm = PETSC_FALSE;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;

    /* factor pointers are filled in by the numeric factorization routines */
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->tempvec        = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->format         = cusparseMatSolveStorageFormat;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->hasTranspose   = PETSC_FALSE;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->isSymmOrHerm   = PETSC_FALSE;
  }
  /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* nothing is on the GPU yet; the first assembly/copy allocates it */
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1230 
1231 /*M
1232    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1233 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1235    CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
1236    the CUSPARSE library. This type is only available when using the 'txpetscgpu' package.
1237    Use --download-txpetscgpu to build/install PETSc to use different CUSPARSE library and
1238    the different GPU storage formats.
1239 
1240    Options Database Keys:
1241 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1242 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
1243 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
-  -mat_cusparse_solve_storage_format csr - sets the storage format of matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
1245 
1246   Level: beginner
1247 
1248 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1249 M*/
1250