xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 22a7b760d1bac7490ac2e75bf367dc393fb2e18c) !
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 
6 #include "petscconf.h"
7 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
8 #include <../src/mat/impls/sbaij/seq/sbaij.h>
9 #include "../src/vec/vec/impls/dvecimpl.h"
10 #include "petsc-private/vecimpl.h"
11 #undef VecType
12 #include "cusparsematimpl.h"
13 
14 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
15 
16 /* this is such a hack ... but I don't know of another way to pass this variable
17    from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
18    SpMV. Essentially, I need to use the same stream variable in two different
19    data structures. I do this by creating a single instance of that stream
20    and reuse it. */
21 cudaStream_t theBodyStream=0;
22 
23 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
24 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
25 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
26 
27 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
28 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
29 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
30 
31 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
32 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
33 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
34 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
35 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
36 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
37 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
38 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
39 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
40 
41 #undef __FUNCT__
42 #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
43 PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
44 {
45   PetscFunctionBegin;
46   *type = MATSOLVERCUSPARSE;
47   PetscFunctionReturn(0);
48 }
49 
50 /*MC
51   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
52   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
53   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
54   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
55   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
56   algorithms are not recommended. This class does NOT support direct solver operations.
57 
58   ./configure --download-txpetscgpu to install PETSc to use CUSPARSE
59 
60   Consult CUSPARSE documentation for more information about the matrix storage formats
61   which correspond to the options database keys below.
62 
63    Options Database Keys:
64 .  -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
65 
66   Level: beginner
67 
68 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
69 M*/
70 
71 #undef __FUNCT__
72 #define __FUNCT__ "MatGetFactor_seqaij_cusparse"
73 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
74 {
75   PetscErrorCode ierr;
76   PetscInt       n = A->rmap->n;
77 
78   PetscFunctionBegin;
79   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
80   (*B)->factortype = ftype;
81   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
82   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
83 
84   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
85     ierr = MatSetBlockSizes(*B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr);
86     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
87     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
88   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
89     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
90     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
91   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
92 
93   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
94   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
95   PetscFunctionReturn(0);
96 }
97 
98 #undef __FUNCT__
99 #define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
100 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
101 {
102   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
103 
104   PetscFunctionBegin;
105   switch (op) {
106   case MAT_CUSPARSE_MULT:
107     cusparseMat->format = format;
108     break;
109   case MAT_CUSPARSE_SOLVE:
110     cusparseMatSolveStorageFormat = format;
111     break;
112   case MAT_CUSPARSE_ALL:
113     cusparseMat->format           = format;
114     cusparseMatSolveStorageFormat = format;
115     break;
116   default:
117     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
118   }
119   PetscFunctionReturn(0);
120 }
121 
122 /*@
123    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
124    operation. Only the MatMult operation can use different GPU storage formats
125    for MPIAIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
126    to build/install PETSc to use this package.
127 
128    Not Collective
129 
130    Input Parameters:
131 +  A - Matrix of type SEQAIJCUSPARSE
132 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
133 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)
134 
135    Output Parameter:
136 
137    Level: intermediate
138 
139 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
140 @*/
141 #undef __FUNCT__
142 #define __FUNCT__ "MatCUSPARSESetFormat"
143 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
144 {
145   PetscErrorCode ierr;
146 
147   PetscFunctionBegin;
148   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
149   ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
150   PetscFunctionReturn(0);
151 }
152 
153 #undef __FUNCT__
154 #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
155 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
156 {
157   PetscErrorCode           ierr;
158   MatCUSPARSEStorageFormat format;
159   PetscBool                flg;
160 
161   PetscFunctionBegin;
162   ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
163   ierr = PetscObjectOptionsBegin((PetscObject)A);
164   if (A->factortype==MAT_FACTOR_NONE) {
165     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
166                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
167     if (flg) {
168       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
169     }
170   } else {
171     ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
172                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
173     if (flg) {
174       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
175     }
176   }
177   ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
178                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
179   if (flg) {
180     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
181   }
182   ierr = PetscOptionsEnd();CHKERRQ(ierr);
183   PetscFunctionReturn(0);
184 
185 }
186 
187 #undef __FUNCT__
188 #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
189 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
190 {
191   PetscErrorCode ierr;
192 
193   PetscFunctionBegin;
194   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
195   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
196   PetscFunctionReturn(0);
197 }
198 
199 #undef __FUNCT__
200 #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
201 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
202 {
203   PetscErrorCode ierr;
204 
205   PetscFunctionBegin;
206   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
207   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
208   PetscFunctionReturn(0);
209 }
210 
211 #undef __FUNCT__
212 #define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
213 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
214 {
215   PetscErrorCode ierr;
216 
217   PetscFunctionBegin;
218   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
219   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
220   PetscFunctionReturn(0);
221 }
222 
223 #undef __FUNCT__
224 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
225 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
226 {
227   PetscErrorCode ierr;
228 
229   PetscFunctionBegin;
230   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
231   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
232   PetscFunctionReturn(0);
233 }
234 
235 #undef __FUNCT__
236 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
237 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
238 {
239   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
240   PetscInt                     n                   = A->rmap->n;
241   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
242   GPU_Matrix_Ifc               *cusparseMat        = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
243   cusparseStatus_t             stat;
244   const PetscInt               *ai = a->i,*aj = a->j,*vi;
245   const MatScalar              *aa = a->a,*v;
246   PetscInt                     *AiLo, *AjLo;
247   PetscScalar                  *AALo;
248   PetscInt                     i,nz, nzLower, offset, rowOffset;
249   PetscErrorCode               ierr;
250 
251   PetscFunctionBegin;
252   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
253     try {
254       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
255       nzLower=n+ai[n]-ai[1];
256 
257       /* Allocate Space for the lower triangular matrix */
258       ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
259       ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
260       ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);
261 
262       /* Fill the lower triangular matrix */
263       AiLo[0]  = (PetscInt) 0;
264       AiLo[n]  = nzLower;
265       AjLo[0]  = (PetscInt) 0;
266       AALo[0]  = (MatScalar) 1.0;
267       v        = aa;
268       vi       = aj;
269       offset   = 1;
270       rowOffset= 1;
271       for (i=1; i<n; i++) {
272         nz = ai[i+1] - ai[i];
273         /* additional 1 for the term on the diagonal */
274         AiLo[i]    = rowOffset;
275         rowOffset += nz+1;
276 
277         ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
278         ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
279 
280         offset      += nz;
281         AjLo[offset] = (PetscInt) i;
282         AALo[offset] = (MatScalar) 1.0;
283         offset      += 1;
284 
285         v  += nz;
286         vi += nz;
287       }
288       cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
289 
290       stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
291       ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
292       stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
293 
294       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;
295 
296       ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
297       ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
298       ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
299     } catch(char *ex) {
300       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
301     }
302   }
303   PetscFunctionReturn(0);
304 }
305 
306 #undef __FUNCT__
307 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
308 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
309 {
310   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
311   PetscInt                     n                   = A->rmap->n;
312   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
313   GPU_Matrix_Ifc               *cusparseMat        = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
314   cusparseStatus_t             stat;
315   const PetscInt               *aj = a->j,*adiag = a->diag,*vi;
316   const MatScalar              *aa = a->a,*v;
317   PetscInt                     *AiUp, *AjUp;
318   PetscScalar                  *AAUp;
319   PetscInt                     i,nz, nzUpper, offset;
320   PetscErrorCode               ierr;
321 
322   PetscFunctionBegin;
323   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
324     try {
325       /* next, figure out the number of nonzeros in the upper triangular matrix. */
326       nzUpper = adiag[0]-adiag[n];
327 
328       /* Allocate Space for the upper triangular matrix */
329       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
330       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
331       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
332 
333       /* Fill the upper triangular matrix */
334       AiUp[0]=(PetscInt) 0;
335       AiUp[n]=nzUpper;
336       offset = nzUpper;
337       for (i=n-1; i>=0; i--) {
338         v  = aa + adiag[i+1] + 1;
339         vi = aj + adiag[i+1] + 1;
340 
341         /* number of elements NOT on the diagonal */
342         nz = adiag[i] - adiag[i+1]-1;
343 
344         /* decrement the offset */
345         offset -= (nz+1);
346 
347         /* first, set the diagonal elements */
348         AjUp[offset] = (PetscInt) i;
349         AAUp[offset] = 1./v[nz];
350         AiUp[i]      = AiUp[i+1] - (nz+1);
351 
352         ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
353         ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
354       }
355       cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
356 
357       stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
358       ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
359       stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
360 
361       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;
362 
363       ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
364       ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
365       ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
366     } catch(char *ex) {
367       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
368     }
369   }
370   PetscFunctionReturn(0);
371 }
372 
373 #undef __FUNCT__
374 #define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
375 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
376 {
377   PetscErrorCode               ierr;
378   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
379   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
380   IS                           isrow = a->row,iscol = a->icol;
381   PetscBool                    row_identity,col_identity;
382   const PetscInt               *r,*c;
383   PetscInt                     n = A->rmap->n;
384 
385   PetscFunctionBegin;
386   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
387   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
388 
389   cusparseTriFactors->tempvec = new CUSPARRAY;
390   cusparseTriFactors->tempvec->resize(n);
391 
392   A->valid_GPU_matrix = PETSC_CUSP_BOTH;
393   /*lower triangular indices */
394   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
395   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
396   if (!row_identity) {
397     ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
398   }
399   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
400 
401   /*upper triangular indices */
402   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
403   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
404   if (!col_identity) {
405     ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
406   }
407   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
408   PetscFunctionReturn(0);
409 }
410 
411 #undef __FUNCT__
412 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
413 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
414 {
415   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
416   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
417   GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
418   GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
419   cusparseStatus_t             stat;
420   PetscErrorCode               ierr;
421   PetscInt                     *AiUp, *AjUp;
422   PetscScalar                  *AAUp;
423   PetscScalar                  *AALo;
424   PetscInt                     nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
425   Mat_SeqSBAIJ                 *b = (Mat_SeqSBAIJ*)A->data;
426   const PetscInt               *ai = b->i,*aj = b->j,*vj;
427   const MatScalar              *aa = b->a,*v;
428 
429   PetscFunctionBegin;
430   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
431     try {
432       /* Allocate Space for the upper triangular matrix */
433       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
434       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
435       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
436       ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
437 
438       /* Fill the upper triangular matrix */
439       AiUp[0]=(PetscInt) 0;
440       AiUp[n]=nzUpper;
441       offset = 0;
442       for (i=0; i<n; i++) {
443         /* set the pointers */
444         v  = aa + ai[i];
445         vj = aj + ai[i];
446         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
447 
448         /* first, set the diagonal elements */
449         AjUp[offset] = (PetscInt) i;
450         AAUp[offset] = 1.0/v[nz];
451         AiUp[i]      = offset;
452         AALo[offset] = 1.0/v[nz];
453 
454         offset+=1;
455         if (nz>0) {
456           ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
457           ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
458           for (j=offset; j<offset+nz; j++) {
459             AAUp[j] = -AAUp[j];
460             AALo[j] = AAUp[j]/v[nz];
461           }
462           offset+=nz;
463         }
464       }
465 
466       /* Build the upper triangular piece */
467       cusparseMatUp = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
468       stat = cusparseMatUp->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
469       ierr = cusparseMatUp->setMatrix(A->rmap->n, A->cmap->n, a->nz, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
470       stat = cusparseMatUp->solveAnalysis();CHKERRCUSP(stat);
471       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMatUp;
472 
473       /* Build the lower triangular piece */
474       cusparseMatLo = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
475       stat = cusparseMatLo->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
476       ierr = cusparseMatLo->setMatrix(A->rmap->n, A->cmap->n, a->nz, AiUp, AjUp, AALo);CHKERRCUSP(ierr);
477       stat = cusparseMatLo->solveAnalysis(CUSPARSE_OPERATION_TRANSPOSE);CHKERRCUSP(stat);
478       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMatLo;
479 
480       /* set this flag ... for performance logging */
481       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->isSymmOrHerm = PETSC_TRUE;
482 
483       A->valid_GPU_matrix = PETSC_CUSP_BOTH;
484       ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
485       ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
486       ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
487       ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
488     } catch(char *ex) {
489       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
490     }
491   }
492   PetscFunctionReturn(0);
493 }
494 
495 #undef __FUNCT__
496 #define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
497 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
498 {
499   PetscErrorCode               ierr;
500   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
501   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
502   IS                           ip = a->row;
503   const PetscInt               *rip;
504   PetscBool                    perm_identity;
505   PetscInt                     n = A->rmap->n;
506 
507   PetscFunctionBegin;
508   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
509   cusparseTriFactors->tempvec = new CUSPARRAY;
510   cusparseTriFactors->tempvec->resize(n);
511   /*lower triangular indices */
512   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
513   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
514   if (!perm_identity) {
515     ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(rip, n);CHKERRCUSP(ierr);
516     ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(rip, n);CHKERRCUSP(ierr);
517   }
518   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
519   PetscFunctionReturn(0);
520 }
521 
522 #undef __FUNCT__
523 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
524 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
525 {
526   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
527   IS             isrow = b->row,iscol = b->col;
528   PetscBool      row_identity,col_identity;
529   PetscErrorCode ierr;
530 
531   PetscFunctionBegin;
532   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
533   /* determine which version of MatSolve needs to be used. */
534   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
535   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
536   if (row_identity && col_identity) {
537     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
538     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
539   } else {
540     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
541     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
542   }
543 
544   /* get the triangular factors */
545   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
546   PetscFunctionReturn(0);
547 }
548 
549 #undef __FUNCT__
550 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
551 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
552 {
553   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
554   IS             ip = b->row;
555   PetscBool      perm_identity;
556   PetscErrorCode ierr;
557 
558   PetscFunctionBegin;
559   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
560 
561   /* determine which version of MatSolve needs to be used. */
562   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
563   if (perm_identity) {
564     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
565     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
566   } else {
567     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
568     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
569   }
570 
571   /* get the triangular factors */
572   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
573   PetscFunctionReturn(0);
574 }
575 
576 #undef __FUNCT__
577 #define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
578 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
579 {
580   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
581   GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
582   GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
583   cusparseStatus_t             stat;
584 
585   PetscFunctionBegin;
586   stat = cusparseMatLo->initializeCusparseMatTranspose(MAT_cusparseHandle,
587                                                        CUSPARSE_MATRIX_TYPE_TRIANGULAR,
588                                                        CUSPARSE_FILL_MODE_UPPER,
589                                                        CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
590   stat = cusparseMatLo->solveAnalysisTranspose();CHKERRCUSP(stat);
591 
592   stat = cusparseMatUp->initializeCusparseMatTranspose(MAT_cusparseHandle,
593                                                        CUSPARSE_MATRIX_TYPE_TRIANGULAR,
594                                                        CUSPARSE_FILL_MODE_LOWER,
595                                                        CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
596   stat = cusparseMatUp->solveAnalysisTranspose();CHKERRCUSP(stat);
597   PetscFunctionReturn(0);
598 }
599 
600 #undef __FUNCT__
601 #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
602 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
603 {
604   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
605   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
606   cusparseStatus_t   stat;
607   PetscErrorCode     ierr;
608 
609   PetscFunctionBegin;
610   if (cusparseMat->isSymmOrHerm) {
611     stat = cusparseMat->mat->initializeCusparseMatTranspose(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_SYMMETRIC, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
612   } else {
613     stat = cusparseMat->mat->initializeCusparseMatTranspose(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
614   }
615   ierr = cusparseMat->mat->setMatrixTranspose(A->rmap->n, A->cmap->n, a->nz, a->i, a->j, a->a);CHKERRCUSP(ierr);
616   PetscFunctionReturn(0);
617 }
618 
619 #undef __FUNCT__
620 #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
621 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
622 {
623   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
624   CUSPARRAY                    *xGPU, *bGPU;
625   cusparseStatus_t             stat;
626   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
627   GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
628   GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
629   CUSPARRAY                    *tempGPU            = (CUSPARRAY*) cusparseTriFactors->tempvec;
630   PetscErrorCode               ierr;
631 
632   PetscFunctionBegin;
633   /* Analyze the matrix ... on the fly */
634   if (!cusparseTriFactors->hasTranspose) {
635     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
636     cusparseTriFactors->hasTranspose=PETSC_TRUE;
637   }
638 
639   /* Get the GPU pointers */
640   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
641   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
642 
643   /* solve with reordering */
644   ierr = cusparseMatUp->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
645   stat = cusparseMatUp->solveTranspose(xGPU, tempGPU);CHKERRCUSP(stat);
646   stat = cusparseMatLo->solveTranspose(tempGPU, xGPU);CHKERRCUSP(stat);
647   ierr = cusparseMatLo->reorderOut(xGPU);CHKERRCUSP(ierr);
648 
649   /* restore */
650   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
651   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
652   ierr = WaitForGPU();CHKERRCUSP(ierr);
653 
654   if (cusparseTriFactors->isSymmOrHerm) {
655     ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr);
656   } else {
657     ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
658   }
659   PetscFunctionReturn(0);
660 }
661 
662 #undef __FUNCT__
663 #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
664 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
665 {
666   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
667   CUSPARRAY                    *xGPU,*bGPU;
668   cusparseStatus_t             stat;
669   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
670   GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
671   GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
672   CUSPARRAY                    *tempGPU            = (CUSPARRAY*) cusparseTriFactors->tempvec;
673   PetscErrorCode               ierr;
674 
675   PetscFunctionBegin;
676   /* Analyze the matrix ... on the fly */
677   if (!cusparseTriFactors->hasTranspose) {
678     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
679     cusparseTriFactors->hasTranspose=PETSC_TRUE;
680   }
681 
682   /* Get the GPU pointers */
683   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
684   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
685 
686   /* solve */
687   stat = cusparseMatUp->solveTranspose(bGPU, tempGPU);CHKERRCUSP(stat);
688   stat = cusparseMatLo->solveTranspose(tempGPU, xGPU);CHKERRCUSP(stat);
689 
690   /* restore */
691   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
692   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
693   ierr = WaitForGPU();CHKERRCUSP(ierr);
694   if (cusparseTriFactors->isSymmOrHerm) {
695     ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr);
696   } else {
697     ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
698   }
699   PetscFunctionReturn(0);
700 }
701 
702 #undef __FUNCT__
703 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
704 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
705 {
706   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
707   CUSPARRAY                    *xGPU,*bGPU;
708   cusparseStatus_t             stat;
709   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
710   GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
711   GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
712   CUSPARRAY                    *tempGPU            = (CUSPARRAY*)cusparseTriFactors->tempvec;
713   PetscErrorCode               ierr;
714 
715   PetscFunctionBegin;
716   /* Get the GPU pointers */
717   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
718   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
719 
720   /* solve with reordering */
721   ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
722   stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
723   stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
724   ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);
725 
726   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
727   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
728   ierr = WaitForGPU();CHKERRCUSP(ierr);
729   if (cusparseTriFactors->isSymmOrHerm) {
730     ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr);
731   } else {
732     ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
733   }
734   PetscFunctionReturn(0);
735 }
736 
737 #undef __FUNCT__
738 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
739 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
740 {
741   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
742   CUSPARRAY                    *xGPU,*bGPU;
743   cusparseStatus_t             stat;
744   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
745   GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
746   GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
747   CUSPARRAY                    *tempGPU            = (CUSPARRAY*)cusparseTriFactors->tempvec;
748   PetscErrorCode               ierr;
749 
750   PetscFunctionBegin;
751   /* Get the GPU pointers */
752   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
753   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
754 
755   /* solve */
756   stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
757   stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
758 
759   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
760   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
761   ierr = WaitForGPU();CHKERRCUSP(ierr);
762   if (cusparseTriFactors->isSymmOrHerm) {
763     ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr);
764   } else {
765     ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
766   }
767   PetscFunctionReturn(0);
768 }
769 
770 #undef __FUNCT__
771 #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
772 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
773 {
774 
775   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
776   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
777   PetscInt           m = A->rmap->n,*ii,*ridx;
778   PetscBool          symmetryTest=PETSC_FALSE, hermitianTest=PETSC_FALSE;
779   PetscBool          symmetryOptionIsSet=PETSC_FALSE, symmetryOptionTest=PETSC_FALSE;
780   PetscErrorCode     ierr;
781 
782   PetscFunctionBegin;
783   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
784     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
785     /*
786       It may be possible to reuse nonzero structure with new matrix values but
787       for simplicity and insured correctness we delete and build a new matrix on
788       the GPU. Likely a very small performance hit.
789     */
790     if (cusparseMat->mat) {
791       try {
792         delete cusparseMat->mat;
793         if (cusparseMat->tempvec) delete cusparseMat->tempvec;
794 
795       } catch(char *ex) {
796         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
797       }
798     }
799     try {
800       cusparseMat->nonzerorow=0;
801       for (int j = 0; j<m; j++) cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);
802 
803       if (a->compressedrow.use) {
804         m    = a->compressedrow.nrows;
805         ii   = a->compressedrow.i;
806         ridx = a->compressedrow.rindex;
807       } else {
808         /* Forcing compressed row on the GPU ... only relevant for CSR storage */
809         int k=0;
810         ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
811         ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
812         ii[0]=0;
813         for (int j = 0; j<m; j++) {
814           if ((a->i[j+1]-a->i[j])>0) {
815             ii[k]  = a->i[j];
816             ridx[k]= j;
817             k++;
818           }
819         }
820         ii[cusparseMat->nonzerorow] = a->nz;
821 
822         m = cusparseMat->nonzerorow;
823       }
824 
825       /* Build our matrix ... first determine the GPU storage type */
826       cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);
827 
828       /* Create the streams and events (if desired).  */
829       PetscMPIInt size;
830       ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
831       ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);
832 
833       ierr = MatIsSymmetricKnown(A,&symmetryOptionIsSet,&symmetryOptionTest);CHKERRQ(ierr);
834       if ((symmetryOptionIsSet && !symmetryOptionTest) || !symmetryOptionIsSet) {
835 	/* CUSPARSE_FILL_MODE_UPPER and CUSPARSE_DIAG_TYPE_NON_UNIT are irrelevant here */
836         cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
837         cusparseMat->isSymmOrHerm = PETSC_FALSE;
838       } else {
839         ierr = MatIsSymmetric(A,0.0,&symmetryTest);CHKERRQ(ierr);
840         if (symmetryTest) {
841           cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_SYMMETRIC, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
842           cusparseMat->isSymmOrHerm = PETSC_TRUE;
843         } else {
844           ierr = MatIsHermitian(A,0.0,&hermitianTest);CHKERRQ(ierr);
845           if (hermitianTest) {
846             cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_HERMITIAN, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
847             cusparseMat->isSymmOrHerm = PETSC_TRUE;
848           } else {
849             /* CUSPARSE_FILL_MODE_UPPER and CUSPARSE_DIAG_TYPE_NON_UNIT are irrelevant here */
850             cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
851             cusparseMat->isSymmOrHerm = PETSC_FALSE;
852           }
853         }
854       }
855 
856       /* lastly, build the matrix */
857       ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
858       cusparseMat->mat->setCPRowIndices(ridx, m);
859       if (!a->compressedrow.use) {
860         ierr = PetscFree(ii);CHKERRQ(ierr);
861         ierr = PetscFree(ridx);CHKERRQ(ierr);
862       }
863       cusparseMat->tempvec = new CUSPARRAY;
864       cusparseMat->tempvec->resize(m);
865     } catch(char *ex) {
866       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
867     }
868     ierr = WaitForGPU();CHKERRCUSP(ierr);
869 
870     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
871 
872     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
873   }
874   PetscFunctionReturn(0);
875 }
876 
877 #undef __FUNCT__
878 #define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
879 static PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
880 {
881   PetscErrorCode ierr;
882 
883   PetscFunctionBegin;
884   if (right) {
885     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
886     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
887     ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
888     ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
889     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
890   }
891   if (left) {
892     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
893     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
894     ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
895     ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
896     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
897   }
898   PetscFunctionReturn(0);
899 }
900 
901 #undef __FUNCT__
902 #define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
903 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
904 {
905   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
906   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
907   CUSPARRAY          *xarray,*yarray;
908   PetscErrorCode     ierr;
909 
910   PetscFunctionBegin;
911   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
912      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
913   ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
914   ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
915   try {
916     ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr);
917   } catch (char *ex) {
918     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
919   }
920   ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
921   ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
922   if (!cusparseMat->mat->hasNonZeroStream()) {
923     ierr = WaitForGPU();CHKERRCUSP(ierr);
924   }
925   if (cusparseMat->isSymmOrHerm) {
926     ierr = PetscLogFlops(4.0*a->nz - 3.0*cusparseMat->nonzerorow);CHKERRQ(ierr);
927   } else {
928     ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
929   }
930   PetscFunctionReturn(0);
931 }
932 
933 #undef __FUNCT__
934 #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
935 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
936 {
937   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
938   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
939   CUSPARRAY          *xarray,*yarray;
940   PetscErrorCode     ierr;
941 
942   PetscFunctionBegin;
943   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
944      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
945   if (!cusparseMat->hasTranspose) {
946     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
947     cusparseMat->hasTranspose=PETSC_TRUE;
948   }
949   ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
950   ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
951   try {
952     ierr = cusparseMat->mat->multiplyTranspose(xarray, yarray);CHKERRCUSP(ierr);
953   } catch (char *ex) {
954     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
955   }
956   ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
957   ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
958   if (!cusparseMat->mat->hasNonZeroStream()) {
959     ierr = WaitForGPU();CHKERRCUSP(ierr);
960   }
961   if (cusparseMat->isSymmOrHerm) {
962     ierr = PetscLogFlops(4.0*a->nz - 3.0*cusparseMat->nonzerorow);CHKERRQ(ierr);
963   } else {
964     ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
965   }
966   PetscFunctionReturn(0);
967 }
968 
969 #undef __FUNCT__
970 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
971 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
972 {
973   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
974   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
975   CUSPARRAY          *xarray,*yarray,*zarray;
976   PetscErrorCode     ierr;
977 
978   PetscFunctionBegin;
979   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
980      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
981   try {
982     ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
983     ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
984     ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
985     ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
986 
987     /* multiply add */
988     ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);
989 
990     ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
991     ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
992     ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
993 
994   } catch(char *ex) {
995     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
996   }
997   ierr = WaitForGPU();CHKERRCUSP(ierr);
998   if (cusparseMat->isSymmOrHerm) {
999     ierr = PetscLogFlops(4.0*a->nz - 2.0*cusparseMat->nonzerorow);CHKERRQ(ierr);
1000   } else {
1001     ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1002   }
1003   PetscFunctionReturn(0);
1004 }
1005 
1006 #undef __FUNCT__
1007 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
1008 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1009 {
1010   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1011   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
1012   CUSPARRAY          *xarray,*yarray,*zarray;
1013   PetscErrorCode     ierr;
1014 
1015   PetscFunctionBegin;
1016   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1017      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
1018   if (!cusparseMat->hasTranspose) {
1019     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1020     cusparseMat->hasTranspose=PETSC_TRUE;
1021   }
1022   try {
1023     ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
1024     ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1025     ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
1026     ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1027 
1028     /* multiply add with matrix transpose */
1029     ierr = cusparseMat->mat->multiplyAddTranspose(xarray, yarray);CHKERRCUSP(ierr);
1030 
1031     ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1032     ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
1033     ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1034 
1035   } catch(char *ex) {
1036     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1037   }
1038   ierr = WaitForGPU();CHKERRCUSP(ierr);
1039   if (cusparseMat->isSymmOrHerm) {
1040     ierr = PetscLogFlops(4.0*a->nz - 2.0*cusparseMat->nonzerorow);CHKERRQ(ierr);
1041   } else {
1042     ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1043   }
1044   PetscFunctionReturn(0);
1045 }
1046 
1047 #undef __FUNCT__
1048 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
1049 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1050 {
1051   PetscErrorCode ierr;
1052 
1053   PetscFunctionBegin;
1054   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
1055   if (A->factortype==MAT_FACTOR_NONE) {
1056     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1057   }
1058   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1059   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1060   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1061   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1062   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1063   PetscFunctionReturn(0);
1064 }
1065 
1066 /* --------------------------------------------------------------------------------*/
1067 #undef __FUNCT__
1068 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
1069 /*@
1070    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1071    (the default parallel PETSc format). This matrix will ultimately pushed down
1072    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1073    assembly performance the user should preallocate the matrix storage by setting
1074    the parameter nz (or the array nnz).  By setting these parameters accurately,
1075    performance during matrix assembly can be increased by more than a factor of 50.
1076 
1077    Collective on MPI_Comm
1078 
1079    Input Parameters:
1080 +  comm - MPI communicator, set to PETSC_COMM_SELF
1081 .  m - number of rows
1082 .  n - number of columns
1083 .  nz - number of nonzeros per row (same for all rows)
1084 -  nnz - array containing the number of nonzeros in the various rows
1085          (possibly different for each row) or NULL
1086 
1087    Output Parameter:
1088 .  A - the matrix
1089 
1090    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1091    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1092    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1093 
1094    Notes:
1095    If nnz is given then nz is ignored
1096 
1097    The AIJ format (also called the Yale sparse matrix format or
1098    compressed row storage), is fully compatible with standard Fortran 77
1099    storage.  That is, the stored row and column indices can begin at
1100    either one (as in Fortran) or zero.  See the users' manual for details.
1101 
1102    Specify the preallocated storage with either nz or nnz (not both).
1103    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1104    allocation.  For large problems you MUST preallocate memory or you
1105    will get TERRIBLE performance, see the users' manual chapter on matrices.
1106 
1107    By default, this format uses inodes (identical nodes) when possible, to
1108    improve numerical efficiency of matrix-vector products and solves. We
1109    search for consecutive rows with the same nonzero structure, thereby
1110    reusing matrix information to achieve increased efficiency.
1111 
1112    Level: intermediate
1113 
1114 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1115 @*/
1116 PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1117 {
1118   PetscErrorCode ierr;
1119 
1120   PetscFunctionBegin;
1121   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1122   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1123   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1124   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1125   PetscFunctionReturn(0);
1126 }
1127 
1128 #undef __FUNCT__
1129 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
1130 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1131 {
1132   PetscErrorCode     ierr;
1133   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
1134 
1135   PetscFunctionBegin;
1136   if (A->factortype==MAT_FACTOR_NONE) {
1137     try {
1138       if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
1139         delete (GPU_Matrix_Ifc*)(cusparseMat->mat);
1140       }
1141       if (cusparseMat->tempvec!=0) delete cusparseMat->tempvec;
1142       delete cusparseMat;
1143       A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
1144     } catch(char *ex) {
1145       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1146     }
1147   } else {
1148     /* The triangular factors */
1149     try {
1150       Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1151       GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
1152       GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
1153       delete (GPU_Matrix_Ifc*) cusparseMatLo;
1154       delete (GPU_Matrix_Ifc*) cusparseMatUp;
1155       delete (CUSPARRAY*) cusparseTriFactors->tempvec;
1156       delete cusparseTriFactors;
1157     } catch(char *ex) {
1158       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1159     }
1160   }
1161   if (MAT_cusparseHandle) {
1162     cusparseStatus_t stat;
1163     stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);
1164 
1165     MAT_cusparseHandle=0;
1166   }
1167   /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
1168   A->spptr = 0;
1169 
1170   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
1171   PetscFunctionReturn(0);
1172 }
1173 
1174 #undef __FUNCT__
1175 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
1176 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1177 {
1178   PetscErrorCode ierr;
1179 
1180   PetscFunctionBegin;
1181   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
1182   if (B->factortype==MAT_FACTOR_NONE) {
1183     /* you cannot check the inode.use flag here since the matrix was just created.
1184        now build a GPU matrix data structure */
1185     B->spptr = new Mat_SeqAIJCUSPARSE;
1186     ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
1187     ((Mat_SeqAIJCUSPARSE*)B->spptr)->tempvec      = 0;
1188     ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
1189     ((Mat_SeqAIJCUSPARSE*)B->spptr)->hasTranspose = PETSC_FALSE;
1190     ((Mat_SeqAIJCUSPARSE*)B->spptr)->isSymmOrHerm = PETSC_FALSE;
1191   } else {
1192     /* NEXT, set the pointers to the triangular factors */
1193     B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1194     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0;
1195     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0;
1196     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->tempvec        = 0;
1197     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->format         = cusparseMatSolveStorageFormat;
1198     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->hasTranspose   = PETSC_FALSE;
1199     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->isSymmOrHerm   = PETSC_FALSE;
1200   }
1201   /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */
1202   if (!MAT_cusparseHandle) {
1203     cusparseStatus_t stat;
1204     stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
1205   }
1206   /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
1207      default cusparse tri solve. Note the difference with the implementation in
1208      MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
1209   ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);
1210 
1211   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1212   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1213   B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
1214   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1215   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
1216   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1217   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1218   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1219 
1220   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1221 
1222   B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
1223 
1224   ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
1225   PetscFunctionReturn(0);
1226 }
1227 
1228 /*M
1229    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1230 
1231    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1232    CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
1233    the CUSPARSE library. This type is only available when using the 'txpetscgpu' package.
1234    Use --download-txpetscgpu to build/install PETSc to use different CUSPARSE library and
1235    the different GPU storage formats.
1236 
1237    Options Database Keys:
1238 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1239 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
1240 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
1241 -  -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
1242 
1243   Level: beginner
1244 
1245 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1246 M*/
1247