xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision b06137fd3c61652c361eb575444d4624c7009b15)
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
5 
6 #include "petscconf.h"
7 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
8 #include <../src/mat/impls/sbaij/seq/sbaij.h>
9 #include "../src/vec/vec/impls/dvecimpl.h"
10 #include "petsc-private/vecimpl.h"
11 #undef VecType
12 #include "cusparsematimpl.h"
13 
14 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
15 
16 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
17 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
19 
20 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
21 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
23 
24 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
26 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
28 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
29 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
31 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
32 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
33 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetStream"
/* Records the given CUDA stream in the matrix's GPU data structure and
   directs the matrix's CUSPARSE handle to issue its work on that stream. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  cusp->stream = stream;
  stat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
46 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetHandle"
/*
   MatCUSPARSESetHandle - Installs a CUSPARSE library handle on the matrix,
   destroying any previously installed handle first, and switches the handle
   to device pointer mode.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) {
    /* The braces are required here: without them CHKERRCUSP(stat) executed
       unconditionally and tested an uninitialized status whenever no handle
       had previously been set. */
    stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat);
  }
  cusparsestruct->handle = handle;
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
61 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEClearHandle"
/* Forgets the CUSPARSE handle stored on the matrix without destroying it;
   ownership of the handle remains with whoever installed it via
   MatCUSPARSESetHandle(). */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
72 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Reports that factors of this matrix come from the CUSPARSE solver package. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
81 
82 /*MC
83   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
84   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
89 
90   Consult CUSPARSE documentation for more information about the matrix storage formats
91   which correspond to the options database keys below.
92 
93    Options Database Keys:
94 .  -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
95 
96   Level: beginner
97 
98 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
99 M*/
100 
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   MatGetFactor_seqaij_cusparse - Creates an empty factor matrix B of type
   MATSEQAIJCUSPARSE for A and installs the symbolic-factorization routines
   matching the requested factor type.

   LU/ILU/ILUDT and Cholesky/ICC are supported; any other factor type raises
   PETSC_ERR_SUP.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  /* factors are square, sized by the number of local rows of A */
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    /* NOTE(review): block sizes are propagated only in the LU/ILU branch and
       not for Cholesky/ICC -- confirm this asymmetry is intended */
    ierr = MatSetBlockSizes(*B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* skip allocation now; the symbolic factorization fills in the structure */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
127 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/* Type-specific implementation of MatCUSPARSESetFormat(): stores the chosen
   GPU storage format for SpMV (on the matrix), for triangular solves (in the
   file-global default), or for both. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT) {
    cusp->format = format;
  } else if (op == MAT_CUSPARSE_SOLVE) {
    cusparseMatSolveStorageFormat = format;
  } else if (op == MAT_CUSPARSE_ALL) {
    cusp->format                  = format;
    cusparseMatSolveStorageFormat = format;
  } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  PetscFunctionReturn(0);
}
151 
152 /*@
153    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
154    operation. Only the MatMult operation can use different GPU storage formats
155    for MPIAIJCUSPARSE matrices.
156    Not Collective
157 
158    Input Parameters:
159 +  A - Matrix of type SEQAIJCUSPARSE
160 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
161 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)
162 
163    Output Parameter:
164 
165    Level: intermediate
166 
167 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
168 @*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation when the matrix type
     composes "MatCUSPARSESetFormat_C"; a no-op for other matrix types */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
180 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   MatSetFromOptions_SeqAIJCUSPARSE - Processes options database entries that
   select the GPU storage format: the SpMV format for unfactored matrices,
   the triangular-solve format for factored ones, and an umbrella option that
   sets both.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  /* NOTE(review): PetscOptionsHead() is invoked before PetscObjectOptionsBegin();
     the usual PETSc idiom is Begin first -- confirm this ordering is intentional */
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* unfactored matrix: the format applies to SpMV */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    /* factored matrix: the format applies to the triangular solves */
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  /* umbrella option: sets the format for both SpMV and TriSolve */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
214 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ILU: delegate to the host SeqAIJ implementation, then install the
   CUSPARSE numeric-factorization routine on the factor. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode status;

  PetscFunctionBegin;
  status = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(status);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
226 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic LU: delegate to the host SeqAIJ implementation, then install the
   CUSPARSE numeric-factorization routine on the factor. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode status;

  PetscFunctionBegin;
  status = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(status);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
238 
#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ICC: delegate to the host SeqAIJ implementation, then install the
   CUSPARSE numeric-factorization routine on the factor. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode status;

  PetscFunctionBegin;
  status = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(status);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
250 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic Cholesky: delegate to the host SeqAIJ implementation, then install
   the CUSPARSE numeric-factorization routine on the factor. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode status;

  PetscFunctionBegin;
  status = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(status);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
262 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Extracts the unit lower triangular
   factor L from the host ILU factor stored in A (SeqAIJ layout), uploads it to
   the GPU as a CSR matrix, and runs the CUSPARSE solve analysis on it.
   A no-op if the GPU copy is already current.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;    /* host CSR row offsets / column indices for L */
  PetscScalar                       *AALo;           /* host CSR values for L */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* ai[n]-ai[1] counts the strict-lower entries of rows 1..n-1; adding n
         accounts for one unit diagonal entry per row (row 0 has only its diagonal) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host memory so the thrust assign() transfers below are fast */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;   /* row 0 of L is just the unit diagonal */
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strict-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      /* unit diagonal: the 1.0 entries stored above are not read by the solve */
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* device-side copies of the host CSR arrays built above */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* the factor now lives on the GPU; release the pinned staging buffers */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
366 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Extracts the upper triangular
   factor U from the host ILU factor stored in A, uploads it to the GPU as a
   CSR matrix, and runs the CUSPARSE solve analysis on it. The diagonal is
   stored inverted (1/d) since the host factor keeps d while the non-unit
   solve multiplies by the stored value. A no-op if the GPU copy is current.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;    /* host CSR row offsets / column indices for U */
  PetscScalar                       *AAUp;           /* host CSR values for U */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag decreases with the row index in this storage, hence adiag[0]-adiag[n] */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned host memory so the thrust assign() transfers below are fast */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      /* rows are walked backwards, filling the CSR arrays from the end */
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];      /* store 1/d: host factor keeps d at position nz */
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strict-upper entries of row i */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      /* non-unit diagonal: the solve uses the stored 1/d entries */
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* device-side copies of the host CSR arrays built above */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* the factor now lives on the GPU; release the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
466 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - Uploads both ILU triangular
   factors of A to the GPU, allocates the solve work vector, and caches the
   row/column permutations on the device when they are not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* build and upload L and U (each is a no-op if the GPU copy is current) */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector shared by the two triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=a->nz;

  /* mark the GPU copy current so the Build* routines above stay no-ops */
  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    /* only cache a device permutation when reordering is actually needed */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
507 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - Builds both triangular factors for
   the ICC/Cholesky solve on the GPU from the host factor. Only the upper
   triangle is assembled on the host; the "lower" factor reuses the same
   sparsity (AiUp/AjUp) with rescaled values and is solved with
   CUSPARSE_OPERATION_TRANSPOSE, i.e. as the transpose of an upper-stored
   matrix. A no-op if the GPU copy is already current.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;    /* host CSR structure shared by both factors */
  PetscScalar                       *AAUp;           /* values of the upper factor */
  PetscScalar                       *AALo;           /* values of the (transposed) lower factor */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed both as Mat_SeqAIJ (for nz) and as
     Mat_SeqSBAIJ (for i/j/a) -- presumably the ICC factor is stored in SBAIJ
     layout; confirm against the factor-creation path */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* pinned host memory so the thrust assign() transfers below are fast */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the diagonal entry; both factors store its reciprocal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = 1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* off-diagonal entries: negated for U, additionally scaled by 1/d for the
             transposed lower factor */
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      /* device-side copies of the host CSR arrays built above */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      /* fill mode stays UPPER on purpose: the lower solve is done as the
         TRANSPOSE solve (see solveOp below) of this upper-stored matrix */
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      /* same sparsity pattern as the upper factor, only the values differ */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* both factors now live on the GPU; release the pinned staging buffers */
      A->valid_GPU_matrix = PETSC_CUSP_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
654 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/* Uploads the ICC triangular factors of A to the GPU, sets up the solve work
   vector, and caches the (shared row/column) permutation on the device when
   it is not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip       = aij->row;
  const PetscInt               *indices;
  PetscBool                    isIdentity;
  PetscInt                     n        = A->rmap->n;

  PetscFunctionBegin;
  /* build and upload both triangular factors (no-op if already current) */
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);

  /* scratch vector shared by the two triangular solves */
  factors->workVector = new THRUSTARRAY;
  factors->workVector->resize(n);
  /* stored nonzeros: strict upper triangle counted twice plus the diagonal */
  factors->nnz = (aij->nz-n)*2 + n;

  /* row and column permutations coincide for ICC; cache them only when nontrivial */
  ierr = ISGetIndices(ip,&indices);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&isIdentity);CHKERRQ(ierr);
  if (!isIdentity) {
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(indices, indices+n);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(indices, indices+n);
  }
  ierr = ISRestoreIndices(ip,&indices);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
685 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric LU factorization: the factorization itself runs on the CPU via the
   standard SeqAIJ kernel, then the triangular factors are copied to the GPU
   and the cusparse solve analysis is performed.  The MatSolve implementation
   is chosen according to whether both orderings are the identity (the
   natural-ordering variants skip the device-side permutations).
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *factors = (Mat_SeqAIJ*)B->data;
  PetscBool      rowNatural,colNatural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side numeric factorization */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* pick the MatSolve flavor based on the orderings */
  ierr = ISIdentity(factors->row,&rowNatural);CHKERRQ(ierr);
  ierr = ISIdentity(factors->col,&colNatural);CHKERRQ(ierr);
  if (rowNatural && colNatural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* push the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
712 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric Cholesky factorization: the factorization runs on the CPU via the
   plain SeqAIJ kernel, then the factors are moved to the GPU and the
   cusparse solve analysis is run.  The MatSolve implementation is chosen
   according to whether the ordering is the identity.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  IS             ordering;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side numeric factorization */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* natural ordering lets MatSolve skip the device-side permutations */
  ordering = ((Mat_SeqAIJ*)B->data)->row;
  ierr = ISIdentity(ordering,&natural);CHKERRQ(ierr);
  B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering
                                   : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering
                                   : MatSolveTranspose_SeqAIJCUSPARSE;

  /* push the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
739 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/*
   Builds device-resident transposes (CSC form) of the cached lower and upper
   triangular factors and runs the cusparse solve analysis on each, so that
   MatSolveTranspose_SeqAIJCUSPARSE* can run entirely on the GPU.  The
   results are cached in loTriFactorPtrTranspose/upTriFactorPtrTranspose on
   A->spptr.

   NOTE(review): previously cached transpose factors would be overwritten
   here without being freed; the current callers only invoke this when both
   cached pointers are NULL -- confirm before calling from elsewhere.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; /* allocated below; formerly dead-initialized from spptr and immediately overwritten */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; /* allocated below */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor settings of the lower triangular factor, flipping
     the fill mode since transposition swaps lower and upper */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* set the operation: the transpose is stored explicitly, so the solve itself
     is a NON_TRANSPOSE solve */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* cache the transposed lower factor */
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor settings of the upper triangular factor, again with
     the fill mode flipped */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* cache the transposed upper factor */
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
872 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/*
   Builds and caches a device-resident transpose of the matrix (in the chosen
   storage format: CSR, or ELL/HYB via a CSR round trip) so that
   MatMultTranspose can use a plain NON_TRANSPOSE cusparse multiply.  The
   result is stored in cusparsestruct->matTranspose.

   NOTE(review): the transpose is given num_rows = A->rmap->n and
   num_cols = A->cmap->n, i.e. the SAME dimensions as A; for rectangular
   matrices these (and the row_offsets allocation sizes) look swapped --
   confirm that only square matrices reach this path.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT; /* allocated below; formerly dead-initialized from matTranspose and immediately overwritten */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the transpose information; the descriptor inherits the
     index base of the original matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

  /* set alpha and beta on the device (multiplies use device pointer mode) */
  err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->rmap->n;
    matrixT->num_cols = A->cmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose of the matrix, i.e. the CSC; indexBase was read
       from matstruct->descr above (the former second read here was redundant) */
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {

    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSP(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete the CSR temporaries; only the HYB copy is kept */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* assign the compressed row indices */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* cache the transpose */
  cusparsestruct->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
997 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/*
   GPU transpose-solve xx = A^{-T} bb for a factored matrix with a
   non-identity ordering: the right-hand side is gathered with the cached row
   permutation, the explicitly stored transposed factors are solved (U^T
   first, then L^T), and the result is scattered with the cached column
   permutation.  The transposed factors are built lazily on first use.

   NOTE(review): rpermIndices/cpermIndices are only created for non-identity
   orderings (see MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU); this routine is
   only installed in that case -- confirm before installing it elsewhere.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU, *bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factors ... on the fly, the
     first time a transpose solve is requested */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, gather bb into xGPU using the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* Next, solve with the transposed upper factor: xGPU -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve with the transposed lower factor: tempGPU -> xGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  /* two flops per nonzero of the factors, minus the n multiplies of a unit diagonal */
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1061 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/*
   GPU transpose-solve xx = A^{-T} bb for a factored matrix with the natural
   (identity) ordering: identical to MatSolveTranspose_SeqAIJCUSPARSE but
   with no permutation steps.  The transposed factors are built lazily on
   first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factors ... on the fly, the
     first time a transpose solve is requested */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, solve with the transposed upper factor: bGPU -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve with the transposed lower factor: tempGPU -> xGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  /* two flops per nonzero of the factors, minus the n multiplies of a unit diagonal */
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1111 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   GPU triangular solve xx = A^{-1} bb for a factored matrix with a
   non-identity ordering: the right-hand side is gathered with the cached row
   permutation, the L then U solves are done by cusparse, and the result is
   scattered with the cached column permutation.  Installed by the factor
   routines only when the ordering is NOT the identity; see the
   *_NaturalOrdering variant otherwise.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, gather bb into xGPU using the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* Next, solve L: xGPU -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve U: tempGPU -> xGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  /* two flops per nonzero of the factors, minus the n multiplies of a unit diagonal */
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1166 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   GPU triangular solve xx = A^{-1} bb for a factored matrix with the natural
   (identity) ordering: identical to MatSolve_SeqAIJCUSPARSE but with no
   permutation steps, so bb is fed straight into the L solve.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, solve L: bGPU -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Next, solve U: tempGPU -> xGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  /* two flops per nonzero of the factors, minus the n multiplies of a unit diagonal */
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1208 
1209 #undef __FUNCT__
1210 #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
1211 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1212 {
1213 
1214   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1215   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1216   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1217   PetscInt                     m = A->rmap->n,*ii,*ridx;
1218   PetscErrorCode               ierr;
1219   cusparseStatus_t             stat;
1220   cudaError_t                  err;
1221 
1222   PetscFunctionBegin;
1223   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
1224     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1225     if (matstruct) {
1226       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized.");
1227     }
1228     try {
1229       cusparsestruct->nonzerorow=0;
1230       for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1231 
1232       if (a->compressedrow.use) {
1233         m    = a->compressedrow.nrows;
1234         ii   = a->compressedrow.i;
1235         ridx = a->compressedrow.rindex;
1236       } else {
1237         /* Forcing compressed row on the GPU */
1238         int k=0;
1239         ierr = PetscMalloc((cusparsestruct->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
1240         ierr = PetscMalloc((cusparsestruct->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
1241         ii[0]=0;
1242         for (int j = 0; j<m; j++) {
1243           if ((a->i[j+1]-a->i[j])>0) {
1244             ii[k]  = a->i[j];
1245             ridx[k]= j;
1246             k++;
1247           }
1248         }
1249         ii[cusparsestruct->nonzerorow] = a->nz;
1250         m = cusparsestruct->nonzerorow;
1251       }
1252 
1253       /* allocate space for the triangular factor information */
1254       matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1255       stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
1256       stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
1257       stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);
1258 
1259       err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
1260       err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
1261       err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
1262       err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
1263       stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
1264 
1265       /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1266       if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1267         /* set the matrix */
1268 	CsrMatrix *matrix= new CsrMatrix;
1269 	matrix->num_rows = A->rmap->n;
1270 	matrix->num_cols = A->cmap->n;
1271 	matrix->num_entries = a->nz;
1272 	matrix->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1273 	matrix->row_offsets->assign(ii, ii + cusparsestruct->nonzerorow+1);
1274 
1275 	matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1276 	matrix->column_indices->assign(a->j, a->j+a->nz);
1277 
1278 	matrix->values = new THRUSTARRAY(a->nz);
1279 	matrix->values->assign(a->a, a->a+a->nz);
1280 
1281         /* assign the pointer */
1282         matstruct->mat = matrix;
1283 
1284       } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1285 
1286 	CsrMatrix *matrix= new CsrMatrix;
1287 	matrix->num_rows = A->rmap->n;
1288 	matrix->num_cols = A->cmap->n;
1289 	matrix->num_entries = a->nz;
1290 	matrix->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1291 	matrix->row_offsets->assign(ii, ii + cusparsestruct->nonzerorow+1);
1292 
1293 	matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1294 	matrix->column_indices->assign(a->j, a->j+a->nz);
1295 
1296 	matrix->values = new THRUSTARRAY(a->nz);
1297 	matrix->values->assign(a->a, a->a+a->nz);
1298 
1299         cusparseHybMat_t hybMat;
1300         stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
1301         cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1302           CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1303         stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1304                                 matstruct->descr, matrix->values->data().get(),
1305                                 matrix->row_offsets->data().get(),
1306                                 matrix->column_indices->data().get(),
1307                                 hybMat, 0, partition);CHKERRCUSP(stat);
1308         /* assign the pointer */
1309         matstruct->mat = hybMat;
1310 
1311 	if (matrix) {
1312 	  if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1313 	  if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1314 	  if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1315 	  delete (CsrMatrix*)matrix;
1316 	}
1317       }
1318 
1319       /* assign the compressed row indices */
1320       matstruct->cprowIndices = new THRUSTINTARRAY(m);
1321       matstruct->cprowIndices->assign(ridx,ridx+m);
1322 
1323       /* assign the pointer */
1324       cusparsestruct->mat = matstruct;
1325 
1326       if (!a->compressedrow.use) {
1327         ierr = PetscFree(ii);CHKERRQ(ierr);
1328         ierr = PetscFree(ridx);CHKERRQ(ierr);
1329       }
1330       cusparsestruct->workVector = new THRUSTARRAY;
1331       cusparsestruct->workVector->resize(m);
1332     } catch(char *ex) {
1333       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1334     }
1335     ierr = WaitForGPU();CHKERRCUSP(ierr);
1336 
1337     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
1338 
1339     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1340   }
1341   PetscFunctionReturn(0);
1342 }
1343 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/*
   Creates GPU (VECSEQCUSP) vectors compatible with this matrix:
   "right" is laid out like the column space (input of MatMult) and
   "left" like the row space (output of MatMult).  Either output
   argument may be NULL if that vector is not wanted.
*/
static PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* the right vector lives in the column space, so it must take the column
       block size; the previous code used mat->rmap->bs, which is wrong for
       rectangular matrices whose row/column block sizes differ */
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1367 
/* Thrust functor for the scatter-add step of MatMultAdd: given a zipped
   tuple (src, dst), accumulates the first element into the second,
   i.e. dst += src. */
struct VecCUSPPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1377 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   Computes yy = A*xx entirely on the GPU via CUSPARSE, dispatching on the
   storage format chosen for this matrix (CSR versus ELL/HYB).
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *mult = (Mat_SeqAIJCUSPARSEMultStruct*)cusp->mat;
  CUSPARRAY                    *xgpu,*ygpu;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* The GPU copy is performed in MatAssemblyEnd_SeqAIJCUSPARSE, so no
     MatSeqAIJCUSPARSECopyToGPU() call is needed here. */
  ierr = VecCUSPGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
  if (cusp->format==MAT_CUSPARSE_CSR) {
    /* CSR storage: y = alpha*A*x + beta*y with device-resident alpha/beta */
    CsrMatrix *csr = (CsrMatrix*)mult->mat;
    stat = cusparse_csr_spmv(cusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             csr->num_rows, csr->num_cols, csr->num_entries,
                             mult->alpha, mult->descr,
                             csr->values->data().get(), csr->row_offsets->data().get(),
                             csr->column_indices->data().get(), xgpu->data().get(),
                             mult->beta, ygpu->data().get());CHKERRCUSP(stat);
  } else {
    /* both the ELL and HYB options are stored as a cusparseHybMat_t */
    cusparseHybMat_t hyb = (cusparseHybMat_t)mult->mat;
    stat = cusparse_hyb_spmv(cusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mult->alpha, mult->descr, hyb,
                             xgpu->data().get(), mult->beta,
                             ygpu->data().get());CHKERRCUSP(stat);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
  /* with a user-supplied stream the caller is responsible for synchronization */
  if (!cusp->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  /* one multiply and one add per nonzero, minus the first add of each nonempty row */
  ierr = PetscLogFlops(2.0*aij->nz - cusp->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1416 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   Computes yy = A^T*xx on the GPU.  An explicitly stored transpose
   (cusparsestruct->matTranspose) is used so that the SpMV can run in the
   NON_TRANSPOSE mode; it is generated lazily on first use.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusp  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *multT = (Mat_SeqAIJCUSPARSEMultStruct*)cusp->matTranspose;
  CUSPARRAY                    *xgpu,*ygpu;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* The GPU copy is performed in MatAssemblyEnd_SeqAIJCUSPARSE, so no
     MatSeqAIJCUSPARSECopyToGPU() call is needed here. */
  if (!multT) {
    /* build the explicit transpose on first demand */
    ierr  = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    multT = (Mat_SeqAIJCUSPARSEMultStruct*)cusp->matTranspose;
  }
  ierr = VecCUSPGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);

  if (cusp->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *csr = (CsrMatrix*)multT->mat;
    stat = cusparse_csr_spmv(cusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             csr->num_rows, csr->num_cols, csr->num_entries,
                             multT->alpha, multT->descr,
                             csr->values->data().get(), csr->row_offsets->data().get(),
                             csr->column_indices->data().get(), xgpu->data().get(),
                             multT->beta, ygpu->data().get());CHKERRCUSP(stat);
  } else {
    cusparseHybMat_t hyb = (cusparseHybMat_t)multT->mat;
    stat = cusparse_hyb_spmv(cusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             multT->alpha, multT->descr, hyb,
                             xgpu->data().get(), multT->beta,
                             ygpu->data().get());CHKERRCUSP(stat);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
  if (!cusp->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*aij->nz - cusp->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1461 
1462 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   Computes zz = yy + A*xx on the GPU.  The product is first formed in a
   compressed work vector (one entry per nonempty row of A) and then
   scatter-added into zz through the compressed-row index list.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *mult = (Mat_SeqAIJCUSPARSEMultStruct*)cusp->mat;
  CUSPARRAY                    *xgpu,*ygpu,*zgpu;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* The GPU copy is performed in MatAssemblyEnd_SeqAIJCUSPARSE, so no
     MatSeqAIJCUSPARSECopyToGPU() call is needed here. */
  try {
    /* start from zz = yy, then accumulate the product into it below */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);

    if (cusp->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *csr = (CsrMatrix*)mult->mat;
      /* the row count handed to the SpMV must be the number of compressed
         (nonempty) rows, which is exactly the work vector length */
      stat = cusparse_csr_spmv(cusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               cusp->workVector->size(), csr->num_cols, csr->num_entries,
                               mult->alpha, mult->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(), xgpu->data().get(),
                               mult->beta, cusp->workVector->data().get());CHKERRCUSP(stat);
    } else {
      cusparseHybMat_t hyb = (cusparseHybMat_t)mult->mat;
      stat = cusparse_hyb_spmv(cusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mult->alpha, mult->descr, hyb,
                               xgpu->data().get(), mult->beta,
                               cusp->workVector->data().get());CHKERRCUSP(stat);
    }

    /* scatter-add the compressed result: zz[cprowIndices[i]] += work[i] */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusp->workVector->begin(), thrust::make_permutation_iterator(zgpu->begin(), mult->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusp->workVector->begin(), thrust::make_permutation_iterator(zgpu->begin(), mult->cprowIndices->begin()))) + cusp->workVector->size(),
                     VecCUSPPlusEquals());
    ierr = VecCUSPRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*aij->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1518 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   Computes zz = yy + A^T*xx on the GPU using the lazily generated explicit
   transpose.  As in MatMultAdd, the product is formed in a compressed work
   vector and then scatter-added into zz via the compressed-row index list.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusp  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *multT = (Mat_SeqAIJCUSPARSEMultStruct*)cusp->matTranspose;
  CUSPARRAY                    *xgpu,*ygpu,*zgpu;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* The GPU copy is performed in MatAssemblyEnd_SeqAIJCUSPARSE, so no
     MatSeqAIJCUSPARSECopyToGPU() call is needed here. */
  if (!multT) {
    /* build the explicit transpose on first demand */
    ierr  = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    multT = (Mat_SeqAIJCUSPARSEMultStruct*)cusp->matTranspose;
  }

  try {
    /* start from zz = yy, then accumulate the product into it below */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);

    if (cusp->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *csr = (CsrMatrix*)multT->mat;
      /* the row count handed to the SpMV must be the number of compressed
         (nonempty) rows, which is exactly the work vector length */
      stat = cusparse_csr_spmv(cusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               cusp->workVector->size(), csr->num_cols, csr->num_entries,
                               multT->alpha, multT->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(), xgpu->data().get(),
                               multT->beta, cusp->workVector->data().get());CHKERRCUSP(stat);
    } else {
      cusparseHybMat_t hyb = (cusparseHybMat_t)multT->mat;
      stat = cusparse_hyb_spmv(cusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               multT->alpha, multT->descr, hyb,
                               xgpu->data().get(), multT->beta,
                               cusp->workVector->data().get());CHKERRCUSP(stat);
    }

    /* scatter-add the compressed result: zz[cprowIndices[i]] += work[i] */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusp->workVector->begin(), thrust::make_permutation_iterator(zgpu->begin(), multT->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusp->workVector->begin(), thrust::make_permutation_iterator(zgpu->begin(), multT->cprowIndices->begin()))) + cusp->workVector->size(),
                     VecCUSPPlusEquals());

    ierr = VecCUSPRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*aij->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1580 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
   Completes assembly on the host, mirrors the data to the GPU (unless this
   is a factored matrix), and -- on a final (non-flush) assembly -- installs
   the CUSPARSE multiply kernels on the operations table.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* final assembly: route the multiplies through the GPU implementations */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1599 
/* --------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1661 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   Destructor for MATSEQAIJCUSPARSE.  Releases all GPU-side data hanging off
   A->spptr -- either the multiply structures (unfactored matrices) or the
   triangular-factor structures (factored matrices) -- then clears spptr and
   delegates host-side cleanup to MatDestroy_SeqAIJ().
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    try {
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        Mat_SeqAIJCUSPARSE           *cusp  = (Mat_SeqAIJCUSPARSE*)A->spptr;
        Mat_SeqAIJCUSPARSEMultStruct *mult  = (Mat_SeqAIJCUSPARSEMultStruct*)cusp->mat;
        Mat_SeqAIJCUSPARSEMultStruct *multT = (Mat_SeqAIJCUSPARSEMultStruct*)cusp->matTranspose;

        /* release the multiply structure for A */
        if (mult) {
          if (mult->descr) { stat = cusparseDestroyMatDescr(mult->descr);CHKERRCUSP(stat); }
          if (mult->mat) {
            if (cusp->format==MAT_CUSPARSE_ELL || cusp->format==MAT_CUSPARSE_HYB) {
              cusparseHybMat_t hyb = (cusparseHybMat_t)mult->mat;
              stat = cusparseDestroyHybMat(hyb);CHKERRCUSP(stat);
            } else {
              CsrMatrix *csr = (CsrMatrix*)mult->mat;
              if (csr->values) delete (THRUSTARRAY*)csr->values;
              if (csr->column_indices) delete (THRUSTINTARRAY32*)csr->column_indices;
              if (csr->row_offsets) delete (THRUSTINTARRAY32*)csr->row_offsets;
              delete csr;
            }
          }
          if (mult->alpha) { err = cudaFree(mult->alpha);CHKERRCUSP(err); }
          if (mult->beta) { err = cudaFree(mult->beta);CHKERRCUSP(err); }
          if (mult->cprowIndices) delete (THRUSTINTARRAY*)mult->cprowIndices;
          delete mult;
        }
        /* release the multiply structure for the explicit transpose, if any */
        if (multT) {
          if (multT->descr) { stat = cusparseDestroyMatDescr(multT->descr);CHKERRCUSP(stat); }
          if (multT->mat) {
            if (cusp->format==MAT_CUSPARSE_ELL || cusp->format==MAT_CUSPARSE_HYB) {
              cusparseHybMat_t hybT = (cusparseHybMat_t)multT->mat;
              stat = cusparseDestroyHybMat(hybT);CHKERRCUSP(stat);
            } else {
              CsrMatrix *csrT = (CsrMatrix*)multT->mat;
              if (csrT->values) delete (THRUSTARRAY*)csrT->values;
              if (csrT->column_indices) delete (THRUSTINTARRAY32*)csrT->column_indices;
              if (csrT->row_offsets) delete (THRUSTINTARRAY32*)csrT->row_offsets;
              delete csrT;
            }
          }
          if (multT->alpha) { err = cudaFree(multT->alpha);CHKERRCUSP(err); }
          if (multT->beta) { err = cudaFree(multT->beta);CHKERRCUSP(err); }
          if (multT->cprowIndices) delete (THRUSTINTARRAY*)multT->cprowIndices;
          delete multT;
        }
        if (cusp->workVector) delete (THRUSTARRAY*)cusp->workVector;
        if (cusp->handle) { stat = cusparseDestroy(cusp->handle);CHKERRCUSP(stat); }
        delete cusp;
        A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
      } else {
        /* the matrix never reached the GPU: only the library handle exists */
        Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
        if (cusp->handle) { stat = cusparseDestroy(cusp->handle);CHKERRCUSP(stat); }
        delete cusp;
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* factored matrix: release the triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors      *tri = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      Mat_SeqAIJCUSPARSETriFactorStruct *lo  = (Mat_SeqAIJCUSPARSETriFactorStruct*)tri->loTriFactorPtr;
      Mat_SeqAIJCUSPARSETriFactorStruct *up  = (Mat_SeqAIJCUSPARSETriFactorStruct*)tri->upTriFactorPtr;
      Mat_SeqAIJCUSPARSETriFactorStruct *loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)tri->loTriFactorPtrTranspose;
      Mat_SeqAIJCUSPARSETriFactorStruct *upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)tri->upTriFactorPtrTranspose;

      /* each factor carries a descriptor, analysis info, and a CSR matrix */
      if (lo) {
        if (lo->descr) { stat = cusparseDestroyMatDescr(lo->descr);CHKERRCUSP(stat); }
        if (lo->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(lo->solveInfo);CHKERRCUSP(stat); }
        if (lo->csrMat) {
          if (lo->csrMat->values) delete (THRUSTARRAY*)lo->csrMat->values;
          if (lo->csrMat->column_indices) delete (THRUSTINTARRAY32*)lo->csrMat->column_indices;
          if (lo->csrMat->row_offsets) delete (THRUSTINTARRAY32*)lo->csrMat->row_offsets;
          delete (CsrMatrix*)lo->csrMat;
        }
        delete lo;
      }

      if (up) {
        if (up->descr) { stat = cusparseDestroyMatDescr(up->descr);CHKERRCUSP(stat); }
        if (up->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(up->solveInfo);CHKERRCUSP(stat); }
        if (up->csrMat) {
          if (up->csrMat->values) delete (THRUSTARRAY*)up->csrMat->values;
          if (up->csrMat->column_indices) delete (THRUSTINTARRAY32*)up->csrMat->column_indices;
          if (up->csrMat->row_offsets) delete (THRUSTINTARRAY32*)up->csrMat->row_offsets;
          delete (CsrMatrix*)up->csrMat;
        }
        delete up;
      }

      if (loT) {
        if (loT->descr) { stat = cusparseDestroyMatDescr(loT->descr);CHKERRCUSP(stat); }
        if (loT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loT->solveInfo);CHKERRCUSP(stat); }
        if (loT->csrMat) {
          if (loT->csrMat->values) delete (THRUSTARRAY*)loT->csrMat->values;
          if (loT->csrMat->column_indices) delete (THRUSTINTARRAY32*)loT->csrMat->column_indices;
          if (loT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loT->csrMat->row_offsets;
          delete (CsrMatrix*)loT->csrMat;
        }
        delete loT;
      }

      if (upT) {
        if (upT->descr) { stat = cusparseDestroyMatDescr(upT->descr);CHKERRCUSP(stat); }
        if (upT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upT->solveInfo);CHKERRCUSP(stat); }
        if (upT->csrMat) {
          if (upT->csrMat->values) delete (THRUSTARRAY*)upT->csrMat->values;
          if (upT->csrMat->column_indices) delete (THRUSTINTARRAY32*)upT->csrMat->column_indices;
          if (upT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upT->csrMat->row_offsets;
          delete (CsrMatrix*)upT->csrMat;
        }
        delete upT;
      }
      if (tri->rpermIndices) delete (THRUSTINTARRAY*)tri->rpermIndices;
      if (tri->cpermIndices) delete (THRUSTINTARRAY*)tri->cpermIndices;
      if (tri->workVector) delete (THRUSTARRAY*)tri->workVector;
      if (tri->handle) { stat = cusparseDestroy(tri->handle);CHKERRCUSP(stat); }
      delete tri;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /* MatDestroy() would PetscFree() a non-NULL spptr, but this memory came from
     C++ new (so it has constructors/destructors); clear the pointer before
     handing off to the host destructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1800 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   Constructor for MATSEQAIJCUSPARSE: builds a sequential AIJ matrix and
   attaches the CUSPARSE-specific GPU data structure in B->spptr.  For an
   unfactored matrix this is a Mat_SeqAIJCUSPARSE (multiply data); for a
   factored matrix it is a Mat_SeqAIJCUSPARSETriFactors (triangular-solve
   data).  Also installs the GPU implementations on the operations table.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusp = new Mat_SeqAIJCUSPARSE;
    B->spptr = cusp;                       /* attach before any call that can error out */
    cusp->mat          = 0;
    cusp->matTranspose = 0;
    cusp->workVector   = 0;
    cusp->format       = MAT_CUSPARSE_CSR;
    cusp->stream       = 0;                /* fixed: the original assigned stream twice */
    cusp->handle       = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    cusp->handle       = handle;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;
    B->spptr = tri;                        /* attach before any call that can error out */
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    tri->format                  = cusparseMatSolveStorageFormat;
    tri->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
  }

  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* nothing has been copied to the GPU yet */
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1862 
1863 /*M
1864    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1865 
1866    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1867    CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
1868    the CUSPARSE library.
1869 
1870    Options Database Keys:
1871 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1872 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1873 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1874 -  -mat_cusparse_solve_storage_format csr - sets the storage format of matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1875 
1876   Level: beginner
1877 
1878 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1879 M*/
1880