xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision a183c035f1d63567095ad55d51f6b3f2111532ac)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 
6 #include <petscconf.h>
7 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
8 #include <../src/mat/impls/sbaij/seq/sbaij.h>
9 #include <../src/vec/vec/impls/dvecimpl.h>
10 #include <petsc-private/vecimpl.h>
11 #undef VecType
12 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
13 
/* Option-value strings for MatCUSPARSEStorageFormat.  The first three entries
   are the selectable formats; the trailing "MatCUSPARSEStorageFormat",
   "MAT_CUSPARSE_" and 0 are the enum-type name, option prefix, and terminator
   expected by PetscOptionsEnum(). */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
15 
/* Forward declarations: Cholesky/ICC factorization hooks */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Forward declarations: LU/ILU factorization hooks */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Forward declarations: triangular solves, options handling, and SpMV ops */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
33 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetStream"
/*
   MatCUSPARSESetStream - Records the given CUDA stream in the matrix' GPU
   data structure and hands it to the CUSPARSE library handle, so that
   subsequent CUSPARSE calls for this matrix are issued on that stream.

   Input Parameters:
+  A      - matrix whose A->spptr holds a Mat_SeqAIJCUSPARSE structure
-  stream - the CUDA stream to use
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   status;

  PetscFunctionBegin;
  cusp->stream = stream;
  status = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSP(status);
  PetscFunctionReturn(0);
}
46 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetHandle"
/*
   MatCUSPARSESetHandle - Installs a caller-supplied CUSPARSE library handle
   on the matrix, destroying any handle the matrix previously owned, and
   switches the handle to device pointer mode.

   Input Parameters:
+  A      - matrix whose A->spptr holds a Mat_SeqAIJCUSPARSE structure
-  handle - the CUSPARSE handle to install
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) {
    /* Braces are required here: previously only the assignment was guarded,
       so CHKERRCUSP(stat) executed unconditionally and read an uninitialized
       stat whenever no handle existed yet. */
    stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat);
  }
  cusparsestruct->handle = handle;
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
61 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEClearHandle"
/*
   MatCUSPARSEClearHandle - Drops the matrix' reference to its CUSPARSE
   handle without calling cusparseDestroy() on it; the handle's lifetime
   remains the caller's responsibility.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* zeroing unconditionally is equivalent to the old guarded assignment:
     a null handle stays null, a live handle is simply forgotten */
  cusp->handle = 0;
  PetscFunctionReturn(0);
}
72 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Query callback composed on factor matrices created by
   MatGetFactor_seqaij_cusparse(): reports MATSOLVERCUSPARSE ("cusparse")
   as the backing solver package.  The matrix argument A is unused. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
81 
82 /*MC
83   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
84   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
86   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
87   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
88   algorithms are not recommended. This class does NOT support direct solver operations.
89 
90   Level: beginner
91 
92 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
93 M*/
94 
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Factory routine registered for MATSOLVERCUSPARSE: creates the (empty)
   n x n factor matrix B of type MATSEQAIJCUSPARSE for A, and installs the
   CUSPARSE-aware symbolic-factorization function pointers that match the
   requested factor type.  Unsupported factor types raise PETSC_ERR_SUP.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    /* block sizes are only propagated on the LU/ILU path (matches upstream) */
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* MAT_SKIP_ALLOCATION: storage is allocated later by the symbolic stage */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
121 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/* Type-specific implementation behind MatCUSPARSESetFormat(): records the
   requested GPU storage format in the Mat_SeqAIJCUSPARSE structure. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  /* for sequential matrices MULT and ALL are handled identically */
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* pre-4.2 CUSPARSE only provides CSR, so the newer formats are rejected */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}
146 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatches to the "MatCUSPARSESetFormat_C" method composed on A
     (MatCUSPARSESetFormat_SeqAIJCUSPARSE for this type); PetscTryMethod
     makes this a no-op for matrix types without the method */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
175 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes the command line options for a SEQAIJCUSPARSE matrix:
   -mat_cusparse_mult_storage_format sets the SpMV format (non-factored
   matrices only), while -mat_cusparse_storage_format sets the format used
   for both SpMV and triangular solves.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr); /* return value was previously unchecked */
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
204 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic ILU factorization: delegates to the host SeqAIJ implementation,
   then overrides the numeric-factorization op so the numeric stage also
   builds the GPU (CUSPARSE) triangular factors.  The override must come
   after the delegated call -- presumably MatILUFactorSymbolic_SeqAIJ
   installs the plain SeqAIJ numeric routine itself; confirm upstream.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
216 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic LU factorization: delegates to the host SeqAIJ implementation,
   then overrides the numeric-factorization op with the CUSPARSE-aware
   version (override must follow the delegated call, which presumably sets
   this op itself -- confirm upstream).
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
228 
#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic ICC factorization: delegates to the host SeqAIJ implementation,
   then overrides the Cholesky numeric-factorization op with the
   CUSPARSE-aware version (override must follow the delegated call).
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
240 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic Cholesky factorization: delegates to the host SeqAIJ
   implementation, then overrides the numeric-factorization op with the
   CUSPARSE-aware version (override must follow the delegated call).
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
252 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/*
   Assembles the unit lower triangular ILU factor of A as a CSR matrix with
   explicit 1.0 diagonal entries, uploads it to the GPU in a freshly
   allocated Mat_SeqAIJCUSPARSETriFactorStruct, and runs the CUSPARSE
   triangular-solve analysis on it.  Host staging arrays are pinned
   (cudaMallocHost) and released before returning.  Only runs when the GPU
   copy is missing or stale (PETSC_CUSP_UNALLOCATED / PETSC_CUSP_CPU).

   NOTE(review): nzLower = n + ai[n] - ai[1] reads ai[1], which assumes
   n >= 1 -- confirm callers never reach here with an empty matrix.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix: row 0 holds only the unit diagonal;
         each later row i holds its strictly-lower entries followed by a 1.0
         diagonal entry appended at column i */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based triangular, lower fill,
         unit diagonal (matches the explicit 1.0 entries assembled above) */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the staged CSR arrays into device containers */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      /* NOTE(review): thrust normally throws exception objects (e.g.
         std::bad_alloc), not char*; this handler may not catch them --
         confirm against the project's error-handling conventions */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
356 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/*
   Assembles the upper triangular ILU factor of A in CSR form, uploads it to
   the GPU in a new Mat_SeqAIJCUSPARSETriFactorStruct, and runs the CUSPARSE
   triangular-solve analysis.  Rows are assembled back to front using the
   a->diag offsets; each diagonal entry is stored as its reciprocal
   (AAUp = 1./v[nz]) and the descriptor marks the diagonal NON_UNIT.
   Only runs when the GPU copy is missing or stale.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host staging) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements: store the reciprocal so the
           NON_UNIT solve can use it directly */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based triangular, upper fill,
         non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the staged CSR arrays into device containers */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      /* NOTE(review): see the matching handler in the lower-tri builder --
         thrust exceptions are typically objects, not char* */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
456 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/*
   Uploads both ILU triangular factors of A to the GPU, allocates the
   length-n device work vector, records the factor's nonzero count, and
   caches the row/column permutation indices on the device -- but only when
   the permutations are not the identity, so the natural-ordering solve path
   allocates nothing extra.  Marks the GPU copy of A as up to date.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* build and analyze L and U on the device */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* device scratch vector of length n -- presumably used between the two
     triangular solves; confirm against MatSolve_SeqAIJCUSPARSE */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
497 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/*
   Builds both GPU triangular factors for an ICC/Cholesky factorization from
   the upper-triangular host factor.  A single upper CSR pattern (AiUp/AjUp)
   is staged once in pinned host memory with two value arrays: AAUp (scaled
   off-diagonals plus reciprocal diagonal) for the upper solve, and AALo for
   the lower solve.  The "lower" factor reuses the upper CSR structure but
   sets solveOp = CUSPARSE_OPERATION_TRANSPOSE, i.e. the lower solve is done
   as a transposed upper solve.  Only runs when the GPU copy is missing or
   stale.

   NOTE(review): A->data is cast both to Mat_SeqAIJ (for a->nz) and to
   Mat_SeqSBAIJ (for the i/j/a arrays) -- this relies on the ICC factor
   storing SBAIJ-compatible fields; confirm against the factor layout.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host staging;
         AAUp and AALo share the AiUp/AjUp pattern) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements: both factors store the
           reciprocal of the diagonal value */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = 1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* negate the off-diagonals; the lower-factor copy is additionally
             scaled by the reciprocal diagonal */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper triangular, unit diagonal
         (the stored reciprocal diagonal is thus ignored by this solve) */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: still FILL_MODE_UPPER -- the lower
         solve is realized as a TRANSPOSE solve on this upper pattern */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUSP_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      /* NOTE(review): thrust exceptions are typically objects, not char*;
         this handler may not catch them -- confirm */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
644 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/*
   Uploads the ICC triangular factors of A to the GPU, allocates the
   length-n device work vector, records the factor's total nonzero count
   (both triangles share the strictly-off-diagonal entries, hence
   (a->nz-n)*2 + n), and caches the single permutation as both row and
   column permutation when it is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* device scratch vector of length n for the triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    /* symmetric factorization: the same index set serves as both the row
       and the column permutation */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
675 
676 #undef __FUNCT__
677 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bseq = (Mat_SeqAIJ*)B->data;
  IS             isrow = bseq->row,iscol = bseq->col;
  PetscBool      rowIdentity,colIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* run the numeric LU factorization on the CPU */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* choose the solve kernels: the natural-ordering variants skip the
     row/column permutation copies */
  ierr = ISIdentity(isrow,&rowIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&colIdentity);CHKERRQ(ierr);
  if (rowIdentity && colIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* upload the triangular factors to the GPU and perform the solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
702 
703 #undef __FUNCT__
704 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bseq = (Mat_SeqAIJ*)B->data;
  PetscBool      isIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* run the numeric Cholesky factorization on the CPU */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* choose the solve kernels: the natural-ordering variants skip the
     permutation copies (Cholesky uses a single symmetric ordering) */
  ierr = ISIdentity(bseq->row,&isIdentity);CHKERRQ(ierr);
  if (isIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* upload the triangular factors to the GPU and perform the solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
729 
730 #undef __FUNCT__
731 #define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/*
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - Builds the transposes (CSC form)
   of the cached lower and upper triangular factors and runs the cuSPARSE solve
   analysis on them, storing the results in loTriFactorPtrTranspose and
   upTriFactorPtrTranspose.  Called lazily from the transpose-solve routines.

   Fix: loTriFactorT/upTriFactorT were initialized from the ...PtrTranspose
   fields and then immediately overwritten with `new`; the dead initializers
   are removed so the lazy-construction intent is clear.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;  /* built here, assigned before use */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;  /* built here, assigned before use */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor;
     the fill mode flips because the transpose of a lower factor is upper */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* set the operation: the matrix is stored explicitly transposed, so the
     solve itself is non-transpose */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* publish the finished factor so the solve routines can find it */
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor (fill mode flips) */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* set the operation (explicit transpose, so non-transpose solve) */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* publish the finished factor so the solve routines can find it */
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
862 
863 #undef __FUNCT__
864 #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/*
   MatSeqAIJCUSPARSEGenerateTransposeForMult - Builds and caches a GPU copy of
   the transpose of A (in the storage format selected for A) for use by
   MatMultTranspose.  Stores the result in cusparsestruct->matTranspose.

   Fixes: the dead initializer of matstructT (immediately overwritten by `new`)
   and a redundant second cusparseGetMatIndexBase call are removed; the comment
   claiming the "upper triangular factor" was transposed is corrected -- this
   routine transposes the whole matrix.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;  /* built here, assigned before use */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the transposed SpMV data; mirror the descriptor of A */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

  /* set alpha and beta: device-resident scalars for SpMV in device pointer mode */
  err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->rmap->n;
    matrixT->num_cols = A->cmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose of A, i.e. the CSC representation of its CSR data */
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSP(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete the CSR temporaries; only the HYB copy is kept */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices (empty: no compressed-row info for the transpose) */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* publish the finished transpose so MatMultTranspose can find it */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
991 
992 #undef __FUNCT__
993 #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/*
   MatSolveTranspose_SeqAIJCUSPARSE - Solves A^T x = b on the GPU using the
   explicitly transposed triangular factors, with row/column permutations.

   This variant is installed by MatLUFactorNumeric_SeqAIJCUSPARSE /
   MatCholeskyFactorNumeric_SeqAIJCUSPARSE only when the ordering is NOT the
   identity, so rpermIndices/cpermIndices are assumed to have been populated
   by the corresponding *AnalysisAndCopyToGPU routine -- NOTE(review): they are
   not checked for NULL here; verify all install paths set them.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU, *bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly, first time only */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder b with the row permutation into x (used as scratch here) */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* Next, solve U^T (the factor order is reversed relative to the forward solve) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve L^T */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  /* restore the vector GPU arrays */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1055 
1056 #undef __FUNCT__
1057 #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/*
   Solves A^T x = b on the GPU for a factorization in natural ordering,
   i.e. without any row/column permutation copies.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xarray,*barray;
  cusparseStatus_t                  cstat;
  Mat_SeqAIJCUSPARSETriFactors      *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)triFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* lazily build and analyze the transposed factors on first use */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtrTranspose;
  }

  /* map the vectors onto the GPU */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* solve with the transposed upper factor first (reverse of the forward solve) */
  cstat = cusparse_solve(triFactors->handle, upT->solveOp,
                         upT->csrMat->num_rows, &ALPHA, upT->descr,
                         upT->csrMat->values->data().get(),
                         upT->csrMat->row_offsets->data().get(),
                         upT->csrMat->column_indices->data().get(),
                         upT->solveInfo,
                         barray->data().get(), work->data().get());CHKERRCUSP(cstat);

  /* then with the transposed lower factor */
  cstat = cusparse_solve(triFactors->handle, loT->solveOp,
                         loT->csrMat->num_rows, &ALPHA, loT->descr,
                         loT->csrMat->values->data().get(),
                         loT->csrMat->row_offsets->data().get(),
                         loT->csrMat->column_indices->data().get(),
                         loT->solveInfo,
                         work->data().get(), xarray->data().get());CHKERRCUSP(cstat);

  /* release the GPU arrays and wait for completion before logging */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*triFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1105 
1106 #undef __FUNCT__
1107 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   MatSolve_SeqAIJCUSPARSE - Solves A x = b on the GPU via forward/backward
   triangular solves with the cached L and U factors, applying the
   factorization's row permutation to b and column permutation to x.

   Installed only when the ordering is NOT the identity, so
   rpermIndices/cpermIndices are assumed to have been populated by the
   corresponding *AnalysisAndCopyToGPU routine -- NOTE(review): not checked
   for NULL here; verify all install paths set them.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder b with the row permutation into x (used as scratch here) */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* Next, solve L (forward substitution) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve U (backward substitution) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1160 
1161 #undef __FUNCT__
1162 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   Solves A x = b on the GPU for a factorization in natural ordering,
   i.e. without any row/column permutation copies.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xarray,*barray;
  cusparseStatus_t                  cstat;
  Mat_SeqAIJCUSPARSETriFactors      *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)triFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* map the vectors onto the GPU */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* forward solve with the lower factor */
  cstat = cusparse_solve(triFactors->handle, lo->solveOp,
                         lo->csrMat->num_rows, &ALPHA, lo->descr,
                         lo->csrMat->values->data().get(),
                         lo->csrMat->row_offsets->data().get(),
                         lo->csrMat->column_indices->data().get(),
                         lo->solveInfo,
                         barray->data().get(), work->data().get());CHKERRCUSP(cstat);

  /* backward solve with the upper factor */
  cstat = cusparse_solve(triFactors->handle, up->solveOp,
                         up->csrMat->num_rows, &ALPHA, up->descr,
                         up->csrMat->values->data().get(),
                         up->csrMat->row_offsets->data().get(),
                         up->csrMat->column_indices->data().get(),
                         up->solveInfo,
                         work->data().get(), xarray->data().get());CHKERRCUSP(cstat);

  /* release the GPU arrays and wait for completion before logging */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*triFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1202 
1203 #undef __FUNCT__
1204 #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
1205 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1206 {
1207 
1208   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1209   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1210   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1211   PetscInt                     m = A->rmap->n,*ii,*ridx;
1212   PetscErrorCode               ierr;
1213   cusparseStatus_t             stat;
1214   cudaError_t                  err;
1215 
1216   PetscFunctionBegin;
1217   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
1218     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1219     if (matstruct) {
1220       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized.");
1221     }
1222     try {
1223       cusparsestruct->nonzerorow=0;
1224       for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1225 
1226       if (a->compressedrow.use) {
1227         m    = a->compressedrow.nrows;
1228         ii   = a->compressedrow.i;
1229         ridx = a->compressedrow.rindex;
1230       } else {
1231         /* Forcing compressed row on the GPU */
1232         int k=0;
1233         ierr = PetscMalloc1((cusparsestruct->nonzerorow+1), &ii);CHKERRQ(ierr);
1234         ierr = PetscMalloc1((cusparsestruct->nonzerorow), &ridx);CHKERRQ(ierr);
1235         ii[0]=0;
1236         for (int j = 0; j<m; j++) {
1237           if ((a->i[j+1]-a->i[j])>0) {
1238             ii[k]  = a->i[j];
1239             ridx[k]= j;
1240             k++;
1241           }
1242         }
1243         ii[cusparsestruct->nonzerorow] = a->nz;
1244         m = cusparsestruct->nonzerorow;
1245       }
1246 
1247       /* allocate space for the triangular factor information */
1248       matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1249       stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
1250       stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
1251       stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);
1252 
1253       err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
1254       err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
1255       err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
1256       err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
1257       stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
1258 
1259       /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1260       if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1261 /* set the matrix */
1262         CsrMatrix *matrix= new CsrMatrix;
1263         matrix->num_rows = m;
1264         matrix->num_cols = A->cmap->n;
1265         matrix->num_entries = a->nz;
1266         matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1267         matrix->row_offsets->assign(ii, ii + m+1);
1268 
1269         matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1270         matrix->column_indices->assign(a->j, a->j+a->nz);
1271 
1272         matrix->values = new THRUSTARRAY(a->nz);
1273         matrix->values->assign(a->a, a->a+a->nz);
1274 
1275 /* assign the pointer */
1276         matstruct->mat = matrix;
1277 
1278       } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1279 #if CUDA_VERSION>=4020
1280         CsrMatrix *matrix= new CsrMatrix;
1281         matrix->num_rows = m;
1282         matrix->num_cols = A->cmap->n;
1283         matrix->num_entries = a->nz;
1284         matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1285         matrix->row_offsets->assign(ii, ii + m+1);
1286 
1287         matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1288         matrix->column_indices->assign(a->j, a->j+a->nz);
1289 
1290         matrix->values = new THRUSTARRAY(a->nz);
1291         matrix->values->assign(a->a, a->a+a->nz);
1292 
1293         cusparseHybMat_t hybMat;
1294         stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
1295         cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1296           CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1297         stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1298                                 matstruct->descr, matrix->values->data().get(),
1299                                 matrix->row_offsets->data().get(),
1300                                 matrix->column_indices->data().get(),
1301                                 hybMat, 0, partition);CHKERRCUSP(stat);
1302         /* assign the pointer */
1303         matstruct->mat = hybMat;
1304 
1305         if (matrix) {
1306           if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1307           if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1308           if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1309           delete (CsrMatrix*)matrix;
1310         }
1311 #endif
1312       }
1313 
1314       /* assign the compressed row indices */
1315       matstruct->cprowIndices = new THRUSTINTARRAY(m);
1316       matstruct->cprowIndices->assign(ridx,ridx+m);
1317 
1318       /* assign the pointer */
1319       cusparsestruct->mat = matstruct;
1320 
1321       if (!a->compressedrow.use) {
1322         ierr = PetscFree(ii);CHKERRQ(ierr);
1323         ierr = PetscFree(ridx);CHKERRQ(ierr);
1324       }
1325       cusparsestruct->workVector = new THRUSTARRAY;
1326       cusparsestruct->workVector->resize(m);
1327     } catch(char *ex) {
1328       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1329     }
1330     ierr = WaitForGPU();CHKERRCUSP(ierr);
1331 
1332     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
1333 
1334     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1335   }
1336   PetscFunctionReturn(0);
1337 }
1338 
#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE"
/*
   Creates VECSEQCUSP (GPU) vectors compatible with mat: *right lives in the
   column space (input of MatMult), *left in the row space (output of MatMult).
   Either output pointer may be NULL if that vector is not wanted.
*/
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rowbs,colbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rowbs,&colbs);CHKERRQ(ierr);
  if (right) {
    /* right vector: sized by the column layout of mat */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,colbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    /* left vector: sized by the row layout of mat */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rowbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1364 
/* Thrust functor used with zip iterators: accumulates tuple element 0 into
   tuple element 1, i.e. get<1>(t) += get<0>(t). */
struct VecCUSPPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1374 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   Computes yy = A*xx on the GPU with cusparse_{csr,hyb}_spmv.
   The GPU copy of the matrix was made in MatAssemblyEnd_SeqAIJCUSPARSE, so no
   host-to-device transfer happens here.
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *mult = (Mat_SeqAIJCUSPARSEMultStruct*)cusp->mat;
  CUSPARRAY                    *xgpu,*ygpu;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The copy to the GPU has been moved to MatAssemblyEnd_SeqAIJCUSPARSE, so
     MatSeqAIJCUSPARSECopyToGPU() is not called here */
  ierr = VecCUSPGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
  if (cusp->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *csr = (CsrMatrix*)mult->mat;
    stat = cusparse_csr_spmv(cusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             csr->num_rows, csr->num_cols, csr->num_entries,
                             mult->alpha, mult->descr,
                             csr->values->data().get(),
                             csr->row_offsets->data().get(),
                             csr->column_indices->data().get(),
                             xgpu->data().get(), mult->beta,
                             ygpu->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL and HYB storage are both held in a cusparseHybMat_t */
    stat = cusparse_hyb_spmv(cusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mult->alpha, mult->descr, (cusparseHybMat_t)mult->mat,
                             xgpu->data().get(), mult->beta,
                             ygpu->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
  /* only block when no user-provided stream governs completion */
  if (!cusp->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*aij->nz - cusp->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1415 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   Computes yy = A^T*xx on the GPU. The transpose data structure is generated
   lazily on first use (via MatSeqAIJCUSPARSEGenerateTransposeForMult) and then
   multiplied with a NON_TRANSPOSE spmv.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusp  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *multT = (Mat_SeqAIJCUSPARSEMultStruct*)cusp->matTranspose;
  CUSPARRAY                    *xgpu,*ygpu;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The copy to the GPU has been moved to MatAssemblyEnd_SeqAIJCUSPARSE, so
     MatSeqAIJCUSPARSECopyToGPU() is not called here */
  if (!multT) {
    /* first transpose multiply: build the transpose structure now */
    ierr  = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    multT = (Mat_SeqAIJCUSPARSEMultStruct*)cusp->matTranspose;
  }
  ierr = VecCUSPGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);

  if (cusp->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *csrT = (CsrMatrix*)multT->mat;
    stat = cusparse_csr_spmv(cusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             csrT->num_rows, csrT->num_cols, csrT->num_entries,
                             multT->alpha, multT->descr,
                             csrT->values->data().get(),
                             csrT->row_offsets->data().get(),
                             csrT->column_indices->data().get(),
                             xgpu->data().get(), multT->beta,
                             ygpu->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL and HYB storage are both held in a cusparseHybMat_t */
    stat = cusparse_hyb_spmv(cusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             multT->alpha, multT->descr, (cusparseHybMat_t)multT->mat,
                             xgpu->data().get(), multT->beta,
                             ygpu->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
  /* only block when no user-provided stream governs completion */
  if (!cusp->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*aij->nz - cusp->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1462 
1463 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   Computes zz = yy + A*xx on the GPU.

   The product A*xx is formed into cusparsestruct->workVector (whose length is
   the number of compressed, i.e. nonempty, rows) and then scattered-added into
   zz through matstruct->cprowIndices, which maps compressed-row slots back to
   full row indices.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    /* seed the result with yy; the product is accumulated on top of this copy */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    /* NOTE(review): yarray is never read below; the get/restore pair on yy may
       only serve to keep its GPU data current -- confirm before removing */
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    /* here we need to be careful to set the number of rows in the multiply to the
       number of compressed rows in the matrix ... which is equivalent to the
       size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      /* guard against an empty work vector (matrix with no nonzero rows) */
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstruct->alpha, matstruct->descr, hybMat,
            xarray->data().get(), matstruct->beta,
            cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    /* the permutation iterator routes workVector[i] to zarray[cprowIndices[i]] */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUSPPlusEquals());
    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1523 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   Computes zz = yy + A^T*xx on the GPU.

   The transpose data structure is generated lazily on first use; the product
   is formed into the compressed-length workVector and scattered-added into zz
   through matstructT->cprowIndices (compressed slot -> full row index).
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!matstructT) {
    /* first transpose multiply: build the transpose structure now */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* seed the result with yy; the product is accumulated on top of this copy */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    /* NOTE(review): yarray is never read below; the get/restore pair on yy may
       only serve to keep its GPU data current -- confirm before removing */
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      /* guard against an empty work vector (matrix with no nonzero rows) */
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstructT->alpha, matstructT->descr, hybMat,
            xarray->data().get(), matstructT->beta,
            cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    /* the permutation iterator routes workVector[i] to zarray[cprowIndices[i]] */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUSPPlusEquals());

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1589 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
   Finishes host-side AIJ assembly, mirrors the matrix to the GPU, and (for a
   final assembly) installs the GPU multiply kernels.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* factored matrices keep their GPU data in the triangular-factor structures,
     so only unfactored matrices are copied here */
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode != MAT_FLUSH_ASSEMBLY) {
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1608 
/* --------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1670 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECsrDestroy"
/* Frees the thrust arrays backing a CsrMatrix and then the CsrMatrix itself.
   A NULL argument is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSECsrDestroy(CsrMatrix *mat)
{
  PetscFunctionBegin;
  if (mat) {
    if (mat->values) delete (THRUSTARRAY*)mat->values;
    if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
    if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
    delete mat;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEMultStructDestroy"
/* Releases one GPU SpMV structure: the cusparse matrix descriptor, the matrix
   storage (CSR or ELL/HYB depending on format), the device-side alpha/beta
   scalars, and the compressed-row index array. NULL is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStructDestroy(Mat_SeqAIJCUSPARSEMultStruct *matstruct,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (matstruct) {
    if (matstruct->descr) { stat = cusparseDestroyMatDescr(matstruct->descr);CHKERRCUSP(stat); }
    if (matstruct->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
        /* ELL and HYB storage are both held in a cusparseHybMat_t */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
#endif
      } else {
        ierr = MatSeqAIJCUSPARSECsrDestroy((CsrMatrix*)matstruct->mat);CHKERRQ(ierr);
      }
    }
    if (matstruct->alpha) { err = cudaFree(matstruct->alpha);CHKERRCUSP(err); }
    if (matstruct->beta) { err = cudaFree(matstruct->beta);CHKERRCUSP(err); }
    if (matstruct->cprowIndices) delete (THRUSTINTARRAY*)matstruct->cprowIndices;
    delete matstruct;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSETriFactorStructDestroy"
/* Releases one triangular-factor structure: the matrix descriptor, the solve
   analysis info, and the CSR storage. NULL is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactorStructDestroy(Mat_SeqAIJCUSPARSETriFactorStruct *factor)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (factor) {
    if (factor->descr) { stat = cusparseDestroyMatDescr(factor->descr);CHKERRCUSP(stat); }
    if (factor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(factor->solveInfo);CHKERRCUSP(stat); }
    ierr = MatSeqAIJCUSPARSECsrDestroy((CsrMatrix*)factor->csrMat);CHKERRQ(ierr);
    delete factor;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   Destroys the GPU data hanging off A->spptr -- the SpMV structures for an
   unfactored matrix, or the triangular-factor structures for a factored one --
   and then falls through to the standard SeqAIJ destroy.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    try {
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        /* delete the multiply structures for A and (if built) A^T */
        ierr = MatSeqAIJCUSPARSEMultStructDestroy((Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
        ierr = MatSeqAIJCUSPARSEMultStructDestroy((Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
        if (cusparsestruct->workVector) delete (THRUSTARRAY*)cusparsestruct->workVector;
        A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
      }
      if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
      delete cusparsestruct;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      ierr = MatSeqAIJCUSPARSETriFactorStructDestroy((Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSETriFactorStructDestroy((Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSETriFactorStructDestroy((Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSETriFactorStructDestroy((Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose);CHKERRQ(ierr);
      if (cusparseTriFactors->rpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->rpermIndices;
      if (cusparseTriFactors->cpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->cpermIndices;
      if (cusparseTriFactors->workVector) delete (THRUSTARRAY*)cusparseTriFactors->workVector;
      if (cusparseTriFactors->handle) { stat = cusparseDestroy(cusparseTriFactors->handle);CHKERRCUSP(stat); }
      delete cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1813 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   Constructor for MATSEQAIJCUSPARSE: builds the host SeqAIJ matrix, attaches
   either the GPU SpMV data (ordinary matrices) or the triangular-factor data
   (factor matrices) to B->spptr, and installs the GPU operations.
   (Original code assigned stream and handle twice; the redundant assignments
   are removed here.)
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusparsestruct = new Mat_SeqAIJCUSPARSE;
    B->spptr = cusparsestruct;                 /* attach before cusparseCreate so destroy stays safe on error */
    cusparsestruct->mat          = 0;
    cusparsestruct->matTranspose = 0;
    cusparsestruct->workVector   = 0;
    cusparsestruct->format       = MAT_CUSPARSE_CSR;
    cusparsestruct->stream       = 0;
    cusparsestruct->handle       = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    cusparsestruct->handle       = handle;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = new Mat_SeqAIJCUSPARSETriFactors;
    B->spptr = cusparseTriFactors;             /* attach before cusparseCreate so destroy stays safe on error */
    cusparseTriFactors->loTriFactorPtr          = 0;
    cusparseTriFactors->upTriFactorPtr          = 0;
    cusparseTriFactors->loTriFactorPtrTranspose = 0;
    cusparseTriFactors->upTriFactorPtrTranspose = 0;
    cusparseTriFactors->rpermIndices            = 0;
    cusparseTriFactors->cpermIndices            = 0;
    cusparseTriFactors->workVector              = 0;
    cusparseTriFactors->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    cusparseTriFactors->handle                  = handle;
    cusparseTriFactors->nnz                     = 0;
  }

  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1874 
1875 /*M
1876    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1877 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1879    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1880    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1881 
1882    Options Database Keys:
1883 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1884 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1886 
1887   Level: beginner
1888 
1889 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1890 M*/
1891