xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 1263af9a6fcf8127cd2b5a60e5a615e689f50791)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 
6 #include <petscconf.h>
7 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
8 #include <../src/mat/impls/sbaij/seq/sbaij.h>
9 #include <../src/vec/vec/impls/dvecimpl.h>
10 #include <petsc-private/vecimpl.h>
11 #undef VecType
12 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
13 
/* Option strings for MatCUSPARSEStorageFormat; the trailing entries (enum type
   name, option prefix, NULL sentinel) follow the layout PetscOptionsEnum()
   expects for enum string tables. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
15 
16 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
17 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
19 
20 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
21 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
23 
24 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
26 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
28 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
29 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
31 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
32 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
33 
34 #undef __FUNCT__
35 #define __FUNCT__ "MatCUSPARSESetStream"
36 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
37 {
38   cusparseStatus_t   stat;
39   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
40 
41   PetscFunctionBegin;
42   cusparsestruct->stream = stream;
43   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSP(stat);
44   PetscFunctionReturn(0);
45 }
46 
47 #undef __FUNCT__
48 #define __FUNCT__ "MatCUSPARSESetHandle"
49 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
50 {
51   cusparseStatus_t   stat;
52   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
53 
54   PetscFunctionBegin;
55   if (cusparsestruct->handle)
56     stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat);
57   cusparsestruct->handle = handle;
58   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
59   PetscFunctionReturn(0);
60 }
61 
62 #undef __FUNCT__
63 #define __FUNCT__ "MatCUSPARSEClearHandle"
64 PetscErrorCode MatCUSPARSEClearHandle(Mat A)
65 {
66   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
67   PetscFunctionBegin;
68   if (cusparsestruct->handle)
69     cusparsestruct->handle = 0;
70   PetscFunctionReturn(0);
71 }
72 
73 #undef __FUNCT__
74 #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
75 PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
76 {
77   PetscFunctionBegin;
78   *type = MATSOLVERCUSPARSE;
79   PetscFunctionReturn(0);
80 }
81 
82 /*MC
83   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
84   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
85   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
86   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
87   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
88   algorithms are not recommended. This class does NOT support direct solver operations.
89 
90   Level: beginner
91 
92 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
93 M*/
94 
95 #undef __FUNCT__
96 #define __FUNCT__ "MatGetFactor_seqaij_cusparse"
97 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
98 {
99   PetscErrorCode ierr;
100   PetscInt       n = A->rmap->n;
101 
102   PetscFunctionBegin;
103   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
104   (*B)->factortype = ftype;
105   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
106   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
107 
108   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
109     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
110     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
111     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
112   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
113     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
114     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
115   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
116 
117   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
118   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
119   PetscFunctionReturn(0);
120 }
121 
122 #undef __FUNCT__
123 #define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
124 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
125 {
126   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
127 
128   PetscFunctionBegin;
129 #if CUDA_VERSION>=4020
130   switch (op) {
131   case MAT_CUSPARSE_MULT:
132     cusparsestruct->format = format;
133     break;
134   case MAT_CUSPARSE_ALL:
135     cusparsestruct->format = format;
136     break;
137   default:
138     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
139   }
140 #else
141   if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB)
142     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
143 #endif
144   PetscFunctionReturn(0);
145 }
146 
147 /*@
148    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
149    operation. Only the MatMult operation can use different GPU storage formats
150    for MPIAIJCUSPARSE matrices.
151    Not Collective
152 
153    Input Parameters:
154 +  A - Matrix of type SEQAIJCUSPARSE
155 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
156 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
157 
158    Output Parameter:
159 
160    Level: intermediate
161 
162 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
163 @*/
164 #undef __FUNCT__
165 #define __FUNCT__ "MatCUSPARSESetFormat"
166 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
167 {
168   PetscErrorCode ierr;
169 
170   PetscFunctionBegin;
171   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
172   ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
173   PetscFunctionReturn(0);
174 }
175 
176 #undef __FUNCT__
177 #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
178 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
179 {
180   PetscErrorCode           ierr;
181   MatCUSPARSEStorageFormat format;
182   PetscBool                flg;
183 
184   PetscFunctionBegin;
185   ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
186   ierr = PetscObjectOptionsBegin((PetscObject)A);
187   if (A->factortype==MAT_FACTOR_NONE) {
188     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
189                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
190     if (flg) {
191       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
192     }
193   }
194   ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
195                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
196   if (flg) {
197     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
198   }
199   ierr = PetscOptionsEnd();CHKERRQ(ierr);
200   PetscFunctionReturn(0);
201 
202 }
203 
204 #undef __FUNCT__
205 #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
206 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
207 {
208   PetscErrorCode ierr;
209 
210   PetscFunctionBegin;
211   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
212   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
213   PetscFunctionReturn(0);
214 }
215 
216 #undef __FUNCT__
217 #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
218 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
219 {
220   PetscErrorCode ierr;
221 
222   PetscFunctionBegin;
223   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
224   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
225   PetscFunctionReturn(0);
226 }
227 
228 #undef __FUNCT__
229 #define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
230 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
231 {
232   PetscErrorCode ierr;
233 
234   PetscFunctionBegin;
235   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
236   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
237   PetscFunctionReturn(0);
238 }
239 
240 #undef __FUNCT__
241 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
242 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
243 {
244   PetscErrorCode ierr;
245 
246   PetscFunctionBegin;
247   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
248   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
249   PetscFunctionReturn(0);
250 }
251 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/*
   Extracts the strictly-lower part of the host ILU factor stored in A,
   appends an explicit unit diagonal to every row, uploads the result as a
   CSR matrix on the GPU, and runs the cuSPARSE triangular-solve analysis
   on it.  Host staging buffers use pinned memory (cudaMallocHost) and are
   freed before returning.  Only runs when the GPU copy is unallocated or
   stale relative to the CPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  /* NOTE(review): this initial value of loTriFactor is never used -- it is
     overwritten by "new" below before any read. */
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;       /* pinned host CSR row offsets / column indices */
  PetscScalar                       *AALo;              /* pinned host CSR values */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): reads ai[1], which assumes n >= 1 -- confirm callers
         never reach here with an empty matrix. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix: row 0 holds only the unit diagonal;
         each later row i copies its strictly-lower entries then appends the
         diagonal 1.0 as the last entry of the row. */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, lower fill,
         unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the pinned host CSR arrays to device vectors */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* release the pinned staging buffers; the data now lives on the GPU.
         NOTE(review): these are leaked if an earlier CHKERR* macro returns
         early -- acceptable only on fatal-error paths. */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      /* only char* throws are converted to PETSc errors here */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
355 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/*
   Extracts the upper part of the host ILU factor stored in A (walking the
   a->diag structure backwards, row n-1 down to 0), uploads it as a CSR
   matrix on the GPU with the diagonal stored as 1/pivot, and runs the
   cuSPARSE triangular-solve analysis on it.  Host staging buffers use
   pinned memory and are freed before returning.  Only runs when the GPU
   copy is unallocated or stale relative to the CPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  /* NOTE(review): this initial value of upTriFactor is never used -- it is
     overwritten by "new" below before any read. */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;       /* pinned host CSR row offsets / column indices */
  PetscScalar                       *AAUp;              /* pinned host CSR values */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag[] decreases with the row index here, hence adiag[0]-adiag[n] */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first
         and filling the CSR arrays back-to-front via a decreasing offset */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements: the stored pivot is inverted so
           the device solve can multiply instead of divide */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill,
         non-unit diagonal (the inverted pivots are stored explicitly) */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the pinned host CSR arrays to device vectors */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned staging buffers; the data now lives on the GPU */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      /* only char* throws are converted to PETSc errors here */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
455 
496 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/*
   Uploads the ICC (Cholesky) factor of A to the GPU as two CSR matrices.
   Only the upper factor is stored explicitly; the lower solve reuses the
   same upper-fill CSR structure with CUSPARSE_OPERATION_TRANSPOSE, so both
   factors share the AiUp/AjUp sparsity pattern and differ only in values.
   Host staging buffers use pinned memory and are freed before returning.
   Only runs when the GPU copy is unallocated or stale relative to the CPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  /* NOTE(review): these initial values of loTriFactor/upTriFactor are never
     used -- both are overwritten by "new" below before any read. */
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;   /* pinned host CSR pattern shared by both factors */
  PetscScalar                       *AAUp;          /* values of the upper factor */
  PetscScalar                       *AALo;          /* values of the (transposed) lower factor */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed both as Mat_SeqAIJ (above) and as
     Mat_SeqSBAIJ here -- this assumes the ICC factor data is laid out in
     SBAIJ form; confirm against the factorization routines. */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix: each row stores 1/pivot on the
         diagonal followed by the scaled off-diagonal entries */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = 1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* negate for the upper factor; additionally scale by 1/pivot for
             the lower factor's values */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill,
         unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: the lower factor is stored as an
         upper-fill matrix (same pattern as above) and solved with the
         TRANSPOSE operation below; the diagonal is non-unit because 1/pivot
         is stored explicitly */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUSP_BOTH;
      /* release the pinned staging buffers; the data now lives on the GPU */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      /* only char* throws are converted to PETSc errors here */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
643 
644 #undef __FUNCT__
645 #define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
646 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
647 {
648   PetscErrorCode               ierr;
649   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
650   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
651   IS                           ip = a->row;
652   const PetscInt               *rip;
653   PetscBool                    perm_identity;
654   PetscInt                     n = A->rmap->n;
655 
656   PetscFunctionBegin;
657   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
658   cusparseTriFactors->workVector = new THRUSTARRAY;
659   cusparseTriFactors->workVector->resize(n);
660   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
661 
662   /*lower triangular indices */
663   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
664   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
665   if (!perm_identity) {
666     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
667     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
668     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
669     cusparseTriFactors->cpermIndices->assign(rip, rip+n);
670   }
671   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
672   PetscFunctionReturn(0);
673 }
674 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Runs the host numeric LU factorization, picks the GPU solve kernels that
   match the row/column orderings, and copies the factors to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  PetscBool      rowIdentity,colIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* natural ordering on both sides allows the permutation-free solve kernels */
  ierr = ISIdentity(b->row,&rowIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(b->col,&colIdentity);CHKERRQ(ierr);
  if (rowIdentity && colIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* move the triangular factors to the GPU */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
701 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* Runs the host numeric Cholesky factorization, picks the GPU solve kernels
   that match the ordering, and copies the factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* the identity ordering allows the permutation-free solve kernels */
  ierr = ISIdentity(b->row,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* move the triangular factors to the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
728 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/*
   Builds the transposes (CSC form, stored as CSR of the transpose) of the
   lower and upper triangular factors and runs the cusparse solve analysis on
   them, so MatSolveTranspose can run entirely on the GPU.  On exit the
   loTriFactorPtrTranspose/upTriFactorPtrTranspose members of the
   Mat_SeqAIJCUSPARSETriFactors hung off A->spptr point to the new structures.

   Callers invoke this only while those transpose pointers are still NULL;
   calling it again would leak the previously built structures.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; /* created below; the old initialization from the struct was dead */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; /* created below */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the matrix descriptors of the lower triangular factor; transposition
     swaps the fill mode (lower becomes upper and vice versa) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* the data is stored already transposed, so the solve op is NON_TRANSPOSE */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* publish the new structure on the matrix (loTriFactorT is a local) */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the matrix descriptors of the upper triangular factor; transposition
     swaps the fill mode (upper becomes lower and vice versa) */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* the data is stored already transposed, so the solve op is NON_TRANSPOSE */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* publish the new structure on the matrix (upTriFactorT is a local) */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
861 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/*
   Builds a GPU copy of the transpose of A, in the storage format selected for
   A (CSR, or ELL/HYB via intermediate CSR conversions), so MatMultTranspose
   can be performed as an untransposed multiply on the transposed data.  The
   result is stored in the matTranspose member of the Mat_SeqAIJCUSPARSE
   structure hung off A->spptr.  Callers invoke this only while matTranspose
   has not yet been built; rebuilding would leak the previous structure.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT; /* created below; the old initialization from the struct was dead */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the transpose information and mirror A's index base */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

  /* set alpha and beta as device-resident scalars (device pointer mode) */
  err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* NOTE(review): for rectangular A (rmap->n != cmap->n) these dimensions and
       the row_offsets allocation look suspect -- csr2csc writes cmap->n+1
       column pointers.  Confirm A is square on this path. */
    matrixT->num_rows = A->rmap->n;
    matrixT->num_cols = A->cmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose of the matrix, i.e. the CSC */
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSP(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* allocate an (initially empty) compressed row index array */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* publish the new structure on the matrix (matstructT is a local) */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
990 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/*
   GPU solve of A^T x = b using the cached transposed triangular factors:
   permute b into x, solve with the transposed upper factor (x -> temp),
   solve with the transposed lower factor (temp -> x), then apply the second
   permutation.  Used when the factorization ordering is not the identity, so
   rpermIndices/cpermIndices are assumed to have been populated by the
   analysis routines.  The transposed factors are built lazily on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU, *bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factors on the first call only */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers for x and b */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, gather b through the row permutation into x */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* First triangular stage: transposed upper factor, x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Second triangular stage: transposed lower factor, tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Gather x through the column permutation into the work vector; a permuted
     copy cannot be done in place, hence the temporary. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary back to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  /* restore the vector array views */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1054 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* GPU solve of A^T x = b for the natural (identity) ordering: no permutation
   of b or x is required, just the two triangular solves with the transposed
   factors, which are built lazily on the first call. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  CUSPARRAY                         *xarray,*barray;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* build the transposed factors the first time they are needed */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* device views of x and b */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* transposed upper factor first: b -> work */
  stat = cusparse_solve(fs->handle, upT->solveOp,
                        upT->csrMat->num_rows, &ALPHA, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        barray->data().get(), work->data().get());CHKERRCUSP(stat);

  /* then the transposed lower factor: work -> x */
  stat = cusparse_solve(fs->handle, loT->solveOp,
                        loT->csrMat->num_rows, &ALPHA, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), xarray->data().get());CHKERRCUSP(stat);

  /* release the device views and wait for completion */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1104 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   GPU solve of A x = b using the cached triangular factors: permute b into x,
   forward-solve with L (x -> temp), back-solve with U (temp -> x), then apply
   the column permutation.  Used when the factorization ordering is not the
   identity, so rpermIndices/cpermIndices are assumed to have been populated
   by the analysis routines.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers for x and b */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, gather b through the row permutation into x */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* Next, solve with the lower factor L: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve with the upper factor U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Gather x through the column permutation into the work vector; a permuted
     copy cannot be done in place, hence the temporary. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary back to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  /* restore the vector array views and wait for completion */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1159 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* GPU solve of A x = b for the natural (identity) ordering: no permutations,
   just the forward solve with L followed by the back solve with U. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  CUSPARRAY                         *xarray,*barray;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* device views of x and b */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* forward solve with the lower factor: b -> work */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows, &ALPHA, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        barray->data().get(), work->data().get());CHKERRCUSP(stat);

  /* back solve with the upper factor: work -> x */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows, &ALPHA, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), xarray->data().get());CHKERRCUSP(stat);

  /* release the device views and wait for completion */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1201 
1202 #undef __FUNCT__
1203 #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
1204 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1205 {
1206 
1207   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1208   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1209   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1210   PetscInt                     m = A->rmap->n,*ii,*ridx;
1211   PetscErrorCode               ierr;
1212   cusparseStatus_t             stat;
1213   cudaError_t                  err;
1214 
1215   PetscFunctionBegin;
1216   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
1217     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1218     if (matstruct) {
1219       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized.");
1220     }
1221     try {
1222       cusparsestruct->nonzerorow=0;
1223       for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1224 
1225       if (a->compressedrow.use) {
1226         m    = a->compressedrow.nrows;
1227         ii   = a->compressedrow.i;
1228         ridx = a->compressedrow.rindex;
1229       } else {
1230         /* Forcing compressed row on the GPU */
1231         int k=0;
1232         ierr = PetscMalloc1((cusparsestruct->nonzerorow+1), &ii);CHKERRQ(ierr);
1233         ierr = PetscMalloc1((cusparsestruct->nonzerorow), &ridx);CHKERRQ(ierr);
1234         ii[0]=0;
1235         for (int j = 0; j<m; j++) {
1236           if ((a->i[j+1]-a->i[j])>0) {
1237             ii[k]  = a->i[j];
1238             ridx[k]= j;
1239             k++;
1240           }
1241         }
1242         ii[cusparsestruct->nonzerorow] = a->nz;
1243         m = cusparsestruct->nonzerorow;
1244       }
1245 
1246       /* allocate space for the triangular factor information */
1247       matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1248       stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
1249       stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
1250       stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);
1251 
1252       err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
1253       err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
1254       err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
1255       err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
1256       stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
1257 
1258       /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1259       if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1260         /* set the matrix */
1261 	CsrMatrix *matrix= new CsrMatrix;
1262 	matrix->num_rows = m;
1263 	matrix->num_cols = A->cmap->n;
1264 	matrix->num_entries = a->nz;
1265 	matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1266 	matrix->row_offsets->assign(ii, ii + m+1);
1267 
1268 	matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1269 	matrix->column_indices->assign(a->j, a->j+a->nz);
1270 
1271 	matrix->values = new THRUSTARRAY(a->nz);
1272 	matrix->values->assign(a->a, a->a+a->nz);
1273 
1274         /* assign the pointer */
1275         matstruct->mat = matrix;
1276 
1277       } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1278 #if CUDA_VERSION>=4020
1279 	CsrMatrix *matrix= new CsrMatrix;
1280 	matrix->num_rows = m;
1281 	matrix->num_cols = A->cmap->n;
1282 	matrix->num_entries = a->nz;
1283 	matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1284 	matrix->row_offsets->assign(ii, ii + m+1);
1285 
1286 	matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1287 	matrix->column_indices->assign(a->j, a->j+a->nz);
1288 
1289 	matrix->values = new THRUSTARRAY(a->nz);
1290 	matrix->values->assign(a->a, a->a+a->nz);
1291 
1292         cusparseHybMat_t hybMat;
1293         stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
1294         cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1295           CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1296         stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1297                                 matstruct->descr, matrix->values->data().get(),
1298                                 matrix->row_offsets->data().get(),
1299                                 matrix->column_indices->data().get(),
1300                                 hybMat, 0, partition);CHKERRCUSP(stat);
1301         /* assign the pointer */
1302         matstruct->mat = hybMat;
1303 
1304 	if (matrix) {
1305 	  if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1306 	  if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1307 	  if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1308 	  delete (CsrMatrix*)matrix;
1309 	}
1310 #endif
1311       }
1312 
1313       /* assign the compressed row indices */
1314       matstruct->cprowIndices = new THRUSTINTARRAY(m);
1315       matstruct->cprowIndices->assign(ridx,ridx+m);
1316 
1317       /* assign the pointer */
1318       cusparsestruct->mat = matstruct;
1319 
1320       if (!a->compressedrow.use) {
1321         ierr = PetscFree(ii);CHKERRQ(ierr);
1322         ierr = PetscFree(ridx);CHKERRQ(ierr);
1323       }
1324       cusparsestruct->workVector = new THRUSTARRAY;
1325       cusparsestruct->workVector->resize(m);
1326     } catch(char *ex) {
1327       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1328     }
1329     ierr = WaitForGPU();CHKERRCUSP(ierr);
1330 
1331     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
1332 
1333     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1334   }
1335   PetscFunctionReturn(0);
1336 }
1337 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/* Create GPU (VECSEQCUSP) work vectors conformal to the matrix: the optional
   right vector matches the column layout and the optional left vector matches
   the row layout of mat. */
static PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rowBlockSize,colBlockSize;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rowBlockSize,&colBlockSize);CHKERRQ(ierr);
  if (right) {
    /* right vector lives in the column space of mat */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,colBlockSize);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    /* left vector lives in the row space of mat */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rowBlockSize);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1363 
/* Thrust functor used with zip iterators: accumulates the first tuple
   component into the second, i.e. dest += src. */
struct VecCUSPPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1373 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/* Computes yy = A*xx on the GPU using the CUSPARSE spmv kernels
   (CSR path, or ELL/HYB path when CUDA >= 4.2). */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xgpu,*ygpu;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* No MatSeqAIJCUSPARSECopyToGPU(A) here: the device copy of A is refreshed
     in MatAssemblyEnd_SeqAIJCUSPARSE. */
  ierr = VecCUSPGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
    /* y = alpha*A*x + beta*y; alpha/beta live in device memory (pointer mode DEVICE).
       NOTE(review): with compressed rows num_rows is the compressed count, so
       only those entries of ygpu are written here — confirm callers rely on this. */
    CsrMatrix *csr = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             csr->num_rows, csr->num_cols, csr->num_entries,
                             matstruct->alpha, matstruct->descr, csr->values->data().get(), csr->row_offsets->data().get(),
                             csr->column_indices->data().get(), xgpu->data().get(), matstruct->beta,
                             ygpu->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL and HYB share the cusparseHybMat_t representation */
    cusparseHybMat_t hyb = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hyb,
                             xgpu->data().get(), matstruct->beta,
                             ygpu->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
  /* only block for completion when no user stream has been attached */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1414 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/* Computes yy = A^T*xx on the GPU. A separate, explicitly-transposed device
   matrix is built lazily on the first call so that the non-transpose spmv
   kernel (which is faster than the transpose variant) can be used. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT     = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xgpu,*ygpu;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* No MatSeqAIJCUSPARSECopyToGPU(A) here: the device copy of A is refreshed
     in MatAssemblyEnd_SeqAIJCUSPARSE. */
  if (!matstructT) {
    /* first transpose multiply: build the transposed device matrix */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUSPGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);

  if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
    /* y = alpha*A^T*x + beta*y via the stored transpose, NON_TRANSPOSE op */
    CsrMatrix *csrT = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             csrT->num_rows, csrT->num_cols,
                             csrT->num_entries, matstructT->alpha, matstructT->descr,
                             csrT->values->data().get(), csrT->row_offsets->data().get(),
                             csrT->column_indices->data().get(), xgpu->data().get(), matstructT->beta,
                             ygpu->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL and HYB share the cusparseHybMat_t representation */
    cusparseHybMat_t hybT = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybT,
                             xgpu->data().get(), matstructT->beta,
                             ygpu->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
  /* only block for completion when no user stream has been attached */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1461 
1462 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/* Computes zz = yy + A*xx. The spmv result lands in the compressed work
   vector and is then scattered into zz with a += through the compressed row
   index list (cprowIndices). */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xgpu,*ygpu,*zgpu;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* No MatSeqAIJCUSPARSECopyToGPU(A) here: the device copy of A is refreshed
     in MatAssemblyEnd_SeqAIJCUSPARSE. */
  try {
    /* seed zz with yy; the product is accumulated on top */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* the row count of this multiply is the number of compressed rows,
         which equals the length of workVector */
      CsrMatrix *csr = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               csr->num_rows, csr->num_cols,
                               csr->num_entries, matstruct->alpha, matstruct->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(), xgpu->data().get(), matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hyb = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hyb,
                                 xgpu->data().get(), matstruct->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* zz[cprowIndices[i]] += workVector[i] for each compressed row i */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zgpu->begin(), matstruct->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zgpu->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                     VecCUSPPlusEquals());
    ierr = VecCUSPRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1522 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/* Computes zz = yy + A^T*xx. Uses the lazily-built transposed device matrix;
   the spmv result lands in the compressed work vector and is then scattered
   into zz with a += through the compressed row index list. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT     = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xgpu,*ygpu,*zgpu;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* No MatSeqAIJCUSPARSECopyToGPU(A) here: the device copy of A is refreshed
     in MatAssemblyEnd_SeqAIJCUSPARSE. */
  if (!matstructT) {
    /* first transpose multiply: build the transposed device matrix */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* seed zz with yy; the product is accumulated on top */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* the row count of this multiply is the number of compressed rows,
         which equals the length of workVector */
      CsrMatrix *csrT = (CsrMatrix*)matstructT->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               csrT->num_rows, csrT->num_cols,
                               csrT->num_entries, matstructT->alpha, matstructT->descr,
                               csrT->values->data().get(), csrT->row_offsets->data().get(),
                               csrT->column_indices->data().get(), xgpu->data().get(), matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybT = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybT,
                                 xgpu->data().get(), matstructT->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* zz[cprowIndices[i]] += workVector[i] for each compressed row i */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zgpu->begin(), matstructT->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zgpu->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                     VecCUSPPlusEquals());

    ierr = VecCUSPRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1588 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/* Finishes assembly on the host, mirrors the (non-factored) matrix to the
   GPU, and on final assembly installs the GPU mat-vec kernels. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* factored matrices keep their data in the triangular-factor structures,
     so only plain matrices are copied to the device here */
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode != MAT_FLUSH_ASSEMBLY) {
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1607 
1608 /* --------------------------------------------------------------------------------*/
1609 #undef __FUNCT__
1610 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
1611 /*@
1612    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1614    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1615    assembly performance the user should preallocate the matrix storage by setting
1616    the parameter nz (or the array nnz).  By setting these parameters accurately,
1617    performance during matrix assembly can be increased by more than a factor of 50.
1618 
1619    Collective on MPI_Comm
1620 
1621    Input Parameters:
1622 +  comm - MPI communicator, set to PETSC_COMM_SELF
1623 .  m - number of rows
1624 .  n - number of columns
1625 .  nz - number of nonzeros per row (same for all rows)
1626 -  nnz - array containing the number of nonzeros in the various rows
1627          (possibly different for each row) or NULL
1628 
1629    Output Parameter:
1630 .  A - the matrix
1631 
1632    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1634    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1635 
1636    Notes:
1637    If nnz is given then nz is ignored
1638 
1639    The AIJ format (also called the Yale sparse matrix format or
1640    compressed row storage), is fully compatible with standard Fortran 77
1641    storage.  That is, the stored row and column indices can begin at
1642    either one (as in Fortran) or zero.  See the users' manual for details.
1643 
1644    Specify the preallocated storage with either nz or nnz (not both).
1645    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1646    allocation.  For large problems you MUST preallocate memory or you
1647    will get TERRIBLE performance, see the users' manual chapter on matrices.
1648 
1649    By default, this format uses inodes (identical nodes) when possible, to
1650    improve numerical efficiency of matrix-vector products and solves. We
1651    search for consecutive rows with the same nonzero structure, thereby
1652    reusing matrix information to achieve increased efficiency.
1653 
1654    Level: intermediate
1655 
1656 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1657 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local sizes equal global sizes */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate directly via the SeqAIJ implementation; nnz (if non-NULL) overrides nz */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1669 
#undef __FUNCT__
#define __FUNCT__ "CsrMatrix_Destroy"
/* Free the thrust device arrays backing a CsrMatrix and the struct itself;
   *mat is set to 0 afterwards. Safe to call with *mat == 0. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    if ((*mat)->values) delete (THRUSTARRAY*)(*mat)->values;
    if ((*mat)->column_indices) delete (THRUSTINTARRAY32*)(*mat)->column_indices;
    if ((*mat)->row_offsets) delete (THRUSTINTARRAY32*)(*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEMultStruct_Destroy"
/* Free one mat-vec structure (descriptor, device matrix in CSR or HYB form,
   device alpha/beta scalars, compressed row indices); *mstruct is zeroed. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **mstruct,MatCUSPARSEStorageFormat format)
{
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*mstruct) {
    if ((*mstruct)->descr) { stat = cusparseDestroyMatDescr((*mstruct)->descr);CHKERRCUSP(stat); }
    if ((*mstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
        /* ELL and HYB both use the cusparseHybMat_t representation */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*mstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
#endif
      } else {
        CsrMatrix *mat = (CsrMatrix*)(*mstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*mstruct)->alpha) { err = cudaFree((*mstruct)->alpha);CHKERRCUSP(err); }
    if ((*mstruct)->beta)  { err = cudaFree((*mstruct)->beta);CHKERRCUSP(err); }
    if ((*mstruct)->cprowIndices) delete (THRUSTINTARRAY*)(*mstruct)->cprowIndices;
    delete *mstruct;
    *mstruct = 0;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSETriFactorStruct_Destroy"
/* Free one triangular-factor structure (descriptor, solve analysis info and
   CSR factor matrix); *trifactor is zeroed. Safe to call with *trifactor == 0. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSP(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSP(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/* Destroy the GPU side of the matrix. For non-factored matrices this frees
   the mat-vec structures (and their transposes); for factored matrices it
   frees the four triangular-factor structures and the permutation index
   arrays. Finally dispatches to MatDestroy_SeqAIJ for the host data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    try {
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
        Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;

        ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);CHKERRQ(ierr);
        ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&matstructT,cusparsestruct->format);CHKERRQ(ierr);
        if (cusparsestruct->workVector) delete (THRUSTARRAY*)cusparsestruct->workVector;
        A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
      }
      /* the handle and the container itself exist in both the allocated and
         the unallocated case */
      if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
      if (cusparsestruct) delete cusparsestruct;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
      Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
      Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
      Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;

      ierr = MatSeqAIJCUSPARSETriFactorStruct_Destroy(&loTriFactor);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSETriFactorStruct_Destroy(&upTriFactor);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSETriFactorStruct_Destroy(&loTriFactorT);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSETriFactorStruct_Destroy(&upTriFactorT);CHKERRQ(ierr);

      if (cusparseTriFactors->rpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->rpermIndices;
      if (cusparseTriFactors->cpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->cpermIndices;
      if (cusparseTriFactors->workVector) delete (THRUSTARRAY*)cusparseTriFactors->workVector;
      if (cusparseTriFactors->handle) { stat = cusparseDestroy(cusparseTriFactors->handle);CHKERRCUSP(stat); }
      if (cusparseTriFactors) delete cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1812 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/* Type constructor: builds on SeqAIJ, attaches either the GPU mat-vec data
   structure (factortype NONE) or the triangular-factor container, and
   installs the CUSPARSE implementations of the matrix operations. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;
    B->spptr           = spptr;
    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    spptr->handle       = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    spptr->handle       = handle;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *spptr = new Mat_SeqAIJCUSPARSETriFactors;
    B->spptr                      = spptr;
    spptr->loTriFactorPtr          = 0;
    spptr->upTriFactorPtr          = 0;
    spptr->loTriFactorPtrTranspose = 0;
    spptr->upTriFactorPtrTranspose = 0;
    spptr->rpermIndices            = 0;
    spptr->cpermIndices            = 0;
    spptr->workVector              = 0;
    spptr->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    spptr->handle                  = handle;
    spptr->nnz                     = 0;
  }

  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* nothing has been mirrored to the device yet */
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1873 
1874 /*M
1875    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1876 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1878    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1879    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1880 
1881    Options Database Keys:
1882 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1883 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1885 
1886   Level: beginner
1887 
1888 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1889 M*/
1890