xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision efca3c55b02548817e185e5069a2acfe20fa4458)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library.
4 */
5 
6 #include "petscconf.h"
7 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
8 #include <../src/mat/impls/sbaij/seq/sbaij.h>
9 #include "../src/vec/vec/impls/dvecimpl.h"
10 #include "petsc-private/vecimpl.h"
11 #undef VecType
12 #include "cusparsematimpl.h"
13 
/* Text names for the MatCUSPARSEStorageFormat enum values; the trailing three
   entries (enum type name, option prefix, NULL) follow the convention required
   by PetscOptionsEnum()/PetscEnum string tables. */
14 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
15 
/* Forward declarations for the factorization, solve, options, and
   multiplication implementations provided by this file. */
16 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
17 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
19 
20 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
21 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
23 
24 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
26 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
28 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
29 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
31 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
32 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
33 
34 #undef __FUNCT__
35 #define __FUNCT__ "MatCUSPARSESetStream"
/*
   MatCUSPARSESetStream - records the CUDA stream this matrix's CUSPARSE
   operations should run on, and installs it on the matrix's CUSPARSE handle.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE
-  stream - the CUDA stream to associate with A's CUSPARSE handle

   NOTE(review): assumes A->spptr is a valid Mat_SeqAIJCUSPARSE with an
   already-created handle; no validation is performed here -- confirm that
   all callers guarantee this.
*/
36 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
37 {
38   cusparseStatus_t   stat;
39   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
40 
41   PetscFunctionBegin;
42   cusparsestruct->stream = stream;
43   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSP(stat);
44   PetscFunctionReturn(0);
45 }
46 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetHandle"
/*
   MatCUSPARSESetHandle - attaches a caller-supplied CUSPARSE handle to the
   matrix, destroying any handle the matrix previously held, and switches the
   handle to device pointer mode.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE (A->spptr must be a valid Mat_SeqAIJCUSPARSE)
-  handle - the CUSPARSE handle the matrix should use from now on
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* Destroy the old handle before taking the new one.  The braces are the fix:
     previously CHKERRCUSP(stat) executed unconditionally after the braceless
     if, so when no handle existed it checked an UNINITIALIZED stat and could
     raise a spurious error (or silently pass garbage). */
  if (cusparsestruct->handle) {
    stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat);
  }
  cusparsestruct->handle = handle;
  /* Scalars passed to CUSPARSE calls (alpha/beta) are expected to live in device memory. */
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
61 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEClearHandle"
/*
   MatCUSPARSEClearHandle - drops this matrix's reference to its CUSPARSE
   handle without calling cusparseDestroy() on it; used when ownership of the
   handle lies elsewhere (e.g. it was installed via MatCUSPARSESetHandle()).

   Input Parameter:
.  A - matrix of type SEQAIJCUSPARSE
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
72 
73 #undef __FUNCT__
74 #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Query callback composed onto factor matrices created by
   MatGetFactor_seqaij_cusparse(): reports MATSOLVERCUSPARSE as the solver
   package that produced the factor. */
75 PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
76 {
77   PetscFunctionBegin;
78   *type = MATSOLVERCUSPARSE;
79   PetscFunctionReturn(0);
80 }
81 
82 /*MC
83   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
84   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
85 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
86 performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
87   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
88   algorithms are not recommended. This class does NOT support direct solver operations.
89 
90   Level: beginner
91 
92 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
93 M*/
94 
95 #undef __FUNCT__
96 #define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   MatGetFactor_seqaij_cusparse - creates the (empty) factor matrix B for a
   SEQAIJCUSPARSE matrix A and installs the CUSPARSE-backed symbolic
   factorization entry points appropriate to the requested factor type.
   The numeric routines are installed later by the symbolic routines.

   Input Parameters:
+  A     - the matrix to be factored (assumed square: only A->rmap->n is used)
-  ftype - MAT_FACTOR_LU/ILU/ILUDT or MAT_FACTOR_CHOLESKY/ICC

   Output Parameter:
.  B - the new factor matrix
*/
97 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
98 {
99   PetscErrorCode ierr;
100   PetscInt       n = A->rmap->n;
101 
102   PetscFunctionBegin;
103   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
104   (*B)->factortype = ftype;
105   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
106   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
107 
108   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    /* NOTE(review): block sizes are propagated only on the LU/ILU path, not for
       Cholesky/ICC below -- confirm that asymmetry is intentional. */
109     ierr = MatSetBlockSizes(*B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr);
110     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
111     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
112   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
113     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
114     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
115   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
116 
  /* Skip allocation: the symbolic factorization will allocate the factor's storage. */
117   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
118   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
119   PetscFunctionReturn(0);
120 }
121 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
   Type-specific implementation behind MatCUSPARSESetFormat() for
   SEQAIJCUSPARSE matrices: records the requested GPU storage format.
   Both supported operations currently store into the same field, so the
   cases share one body.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: both operations set the same field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* Pre-4.2 CUDA only supports CSR; reject formats that require newer CUSPARSE. */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}
146 
147 /*@
148    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
149    operation. Only the MatMult operation can use different GPU storage formats
150    for MPIAIJCUSPARSE matrices.
151    Not Collective
152 
153    Input Parameters:
154 +  A - Matrix of type SEQAIJCUSPARSE
155 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
156 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
157 
158    Output Parameter:
159 
160    Level: intermediate
161 
162 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
163 @*/
164 #undef __FUNCT__
165 #define __FUNCT__ "MatCUSPARSESetFormat"
166 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
167 {
168   PetscErrorCode ierr;
169 
170   PetscFunctionBegin;
171   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* Dispatch to the type-specific implementation; silently a no-op for Mat
     types that never composed "MatCUSPARSESetFormat_C" (PetscTryMethod). */
172   ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
173   PetscFunctionReturn(0);
174 }
175 
176 #undef __FUNCT__
177 #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   MatSetFromOptions_SeqAIJCUSPARSE - processes the command-line options that
   choose GPU storage formats:
     -mat_cusparse_mult_storage_format  (SpMV only; skipped for factor matrices)
     -mat_cusparse_storage_format       (SpMV and triangular solve)

   NOTE(review): the PetscObjectOptionsBegin() result is assigned to ierr but
   never checked with CHKERRQ, and PetscOptionsHead() is called before the
   options block is begun -- both look suspicious; verify against the
   PetscOptionsBegin/End macro definitions of this PETSc version.
*/
178 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
179 {
180   PetscErrorCode           ierr;
181   MatCUSPARSEStorageFormat format;
182   PetscBool                flg;
183 
184   PetscFunctionBegin;
185   ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
186   ierr = PetscObjectOptionsBegin((PetscObject)A);
  /* The SpMV-only format option makes no sense for a factored matrix (those use
     the triangular-solve path), hence the MAT_FACTOR_NONE guard. */
187   if (A->factortype==MAT_FACTOR_NONE) {
188     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
189                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
190     if (flg) {
191       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
192     }
193   }
194   ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
195                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
196   if (flg) {
197     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
198   }
199   ierr = PetscOptionsEnd();CHKERRQ(ierr);
200   PetscFunctionReturn(0);
201 
202 }
203 
204 #undef __FUNCT__
205 #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Delegates the symbolic ILU factorization to the host (SeqAIJ) implementation,
   then swaps in the CUSPARSE numeric routine so the numeric phase triggers the
   GPU triangular-factor setup. */
206 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
207 {
208   PetscErrorCode ierr;
209 
210   PetscFunctionBegin;
211   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
212   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
213   PetscFunctionReturn(0);
214 }
215 
216 #undef __FUNCT__
217 #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Delegates the symbolic LU factorization to the host (SeqAIJ) implementation,
   then installs the CUSPARSE numeric routine for the numeric phase. */
218 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
219 {
220   PetscErrorCode ierr;
221 
222   PetscFunctionBegin;
223   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
224   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
225   PetscFunctionReturn(0);
226 }
227 
228 #undef __FUNCT__
229 #define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/* Delegates the symbolic ICC factorization to the host (SeqAIJ) implementation,
   then installs the CUSPARSE Cholesky numeric routine for the numeric phase. */
230 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
231 {
232   PetscErrorCode ierr;
233 
234   PetscFunctionBegin;
235   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
236   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
237   PetscFunctionReturn(0);
238 }
239 
240 #undef __FUNCT__
241 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/* Delegates the symbolic Cholesky factorization to the host (SeqAIJ)
   implementation, then installs the CUSPARSE numeric routine. */
242 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
243 {
244   PetscErrorCode ierr;
245 
246   PetscFunctionBegin;
247   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
248   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - extracts the unit-diagonal lower
   triangular factor L out of the combined LU storage of a host-factored
   SeqAIJ matrix, uploads it to the GPU in CSR form, and runs the CUSPARSE
   triangular-solve analysis on it.  Only runs when the GPU copy is absent or
   stale (valid_GPU_matrix is UNALLOCATED or CPU).

   NOTE(review): the fill loop handles row 0 explicitly and reads ai[1], so
   this assumes n >= 1 -- confirm callers never pass an empty matrix.
*/
254 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
255 {
256   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
257   PetscInt                          n = A->rmap->n;
258   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
259   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
260   cusparseStatus_t                  stat;
261   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
262   const MatScalar                   *aa = a->a,*v;
263   PetscInt                          *AiLo, *AjLo;
264   PetscScalar                       *AALo;
265   PetscInt                          i,nz, nzLower, offset, rowOffset;
266   PetscErrorCode                    ierr;
267 
268   PetscFunctionBegin;
269   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
270     try {
271       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* ai[n]-ai[1] = strictly-lower entries of rows 1..n-1 (row 0 of L has only
         its unit diagonal); the extra n counts the unit diagonal of every row. */
272       nzLower=n+ai[n]-ai[1];
273 
274       /* Allocate Space for the lower triangular matrix */
      /* Pinned (page-locked) host buffers so the later device uploads are fast. */
275       ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
276       ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
277       ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);
278 
279       /* Fill the lower triangular matrix */
280       AiLo[0]  = (PetscInt) 0;
281       AiLo[n]  = nzLower;
282       AjLo[0]  = (PetscInt) 0;
283       AALo[0]  = (MatScalar) 1.0;
284       v        = aa;
285       vi       = aj;
286       offset   = 1;
287       rowOffset= 1;
288       for (i=1; i<n; i++) {
289         nz = ai[i+1] - ai[i];
290         /* additional 1 for the term on the diagonal */
291         AiLo[i]    = rowOffset;
292         rowOffset += nz+1;
293 
        /* copy the strictly-lower part of row i, then append the unit diagonal */
294         ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
295         ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
296 
297         offset      += nz;
298         AjLo[offset] = (PetscInt) i;
299         AALo[offset] = (MatScalar) 1.0;
300         offset      += 1;
301 
302         v  += nz;
303         vi += nz;
304       }
305 
306       /* allocate space for the triangular factor information */
307       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
308 
309       /* Create the matrix description */
310       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
311       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
312       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
313       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
314       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
315 
316       /* Create the solve analysis information */
317       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);
318 
319       /* set the operation */
320       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
321 
322       /* set the matrix */
      /* Upload the three CSR arrays; indices go into 32-bit device arrays as
         required by this CUSPARSE CSR interface. */
323       loTriFactor->csrMat = new CsrMatrix;
324       loTriFactor->csrMat->num_rows = n;
325       loTriFactor->csrMat->num_cols = n;
326       loTriFactor->csrMat->num_entries = nzLower;
327 
328       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
329       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
330 
331       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
332       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
333 
334       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
335       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
336 
337       /* perform the solve analysis */
338       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
339                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
340                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
341                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);
342 
343       /* assign the pointer. Is this really necessary? */
344       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
345 
346       ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
347       ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
348       ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    /* NOTE(review): thrust typically throws std::exception-derived types, not
       char*; confirm this catch clause can actually catch the errors intended. */
349     } catch(char *ex) {
350       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
351     }
352   }
353   PetscFunctionReturn(0);
354 }
355 
356 #undef __FUNCT__
357 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - extracts the upper triangular
   factor U out of the combined LU storage of a host-factored SeqAIJ matrix,
   uploads it to the GPU in CSR form, and runs the CUSPARSE triangular-solve
   analysis.  The host factorization stores U rows in reverse order behind the
   a->diag pointers with inverted diagonal entries, which this routine undoes.
   Only runs when the GPU copy is absent or stale.
*/
358 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
359 {
360   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
361   PetscInt                          n = A->rmap->n;
362   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
363   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
364   cusparseStatus_t                  stat;
365   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
366   const MatScalar                   *aa = a->a,*v;
367   PetscInt                          *AiUp, *AjUp;
368   PetscScalar                       *AAUp;
369   PetscInt                          i,nz, nzUpper, offset;
370   PetscErrorCode                    ierr;
371 
372   PetscFunctionBegin;
373   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
374     try {
375       /* next, figure out the number of nonzeros in the upper triangular matrix. */
376       nzUpper = adiag[0]-adiag[n];
377 
378       /* Allocate Space for the upper triangular matrix */
      /* Pinned host buffers for fast uploads to the device. */
379       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
380       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
381       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
382 
383       /* Fill the upper triangular matrix */
384       AiUp[0]=(PetscInt) 0;
385       AiUp[n]=nzUpper;
386       offset = nzUpper;
      /* Rows are walked from the last to the first because the host factor
         stores U in reverse row order between adiag[i+1] and adiag[i]. */
387       for (i=n-1; i>=0; i--) {
388         v  = aa + adiag[i+1] + 1;
389         vi = aj + adiag[i+1] + 1;
390 
391         /* number of elements NOT on the diagonal */
392         nz = adiag[i] - adiag[i+1]-1;
393 
394         /* decrement the offset */
395         offset -= (nz+1);
396 
397         /* first, set the diagonal elements */
398         AjUp[offset] = (PetscInt) i;
        /* the host factor stores 1/U(i,i) at v[nz]; invert back to U(i,i)'s reciprocal form expected here */
399         AAUp[offset] = 1./v[nz];
400         AiUp[i]      = AiUp[i+1] - (nz+1);
401 
402         ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
403         ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
404       }
405 
406       /* allocate space for the triangular factor information */
407       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
408 
409       /* Create the matrix description */
410       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
411       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
412       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
413       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      /* U carries the real diagonal (unlike unit-diagonal L) */
414       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
415 
416       /* Create the solve analysis information */
417       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);
418 
419       /* set the operation */
420       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
421 
422       /* set the matrix */
423       upTriFactor->csrMat = new CsrMatrix;
424       upTriFactor->csrMat->num_rows = n;
425       upTriFactor->csrMat->num_cols = n;
426       upTriFactor->csrMat->num_entries = nzUpper;
427 
428       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
429       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
430 
431       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
432       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
433 
434       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
435       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
436 
437       /* perform the solve analysis */
438       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
439                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
440                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
441                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);
442 
443       /* assign the pointer. Is this really necessary? */
444       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
445 
446       ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
447       ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
448       ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    /* NOTE(review): thrust usually throws std::exception-derived types, not
       char*; verify this catch clause matches the errors actually thrown. */
449     } catch(char *ex) {
450       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
451     }
452   }
453   PetscFunctionReturn(0);
454 }
455 
456 #undef __FUNCT__
457 #define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - builds both GPU triangular
   factors (L and U) of an LU/ILU factorization, allocates the work vector
   used between the two triangular solves, and uploads the row/column
   permutations when the orderings are not the identity (the identity case
   uses the faster NaturalOrdering solve path and needs no permutation arrays).
*/
458 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
459 {
460   PetscErrorCode               ierr;
461   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
462   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
463   IS                           isrow = a->row,iscol = a->icol;
464   PetscBool                    row_identity,col_identity;
465   const PetscInt               *r,*c;
466   PetscInt                     n = A->rmap->n;
467 
468   PetscFunctionBegin;
469   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
470   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
471 
  /* scratch vector holding the intermediate result between the L and U solves */
472   cusparseTriFactors->workVector = new THRUSTARRAY;
473   cusparseTriFactors->workVector->resize(n);
474   cusparseTriFactors->nnz=a->nz;
475 
476   A->valid_GPU_matrix = PETSC_CUSP_BOTH;
477   /*lower triangular indices */
478   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
479   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
480   if (!row_identity) {
481     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
482     cusparseTriFactors->rpermIndices->assign(r, r+n);
483   }
484   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
485 
486   /*upper triangular indices */
487   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
488   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
489   if (!col_identity) {
490     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
491     cusparseTriFactors->cpermIndices->assign(c, c+n);
492   }
493   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
494   PetscFunctionReturn(0);
495 }
496 
497 #undef __FUNCT__
498 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - builds the GPU triangular factors
   for a Cholesky/ICC factorization.  Only the upper factor U is materialized
   on the host; the "lower" factor reuses the SAME sparsity pattern and is
   solved as the TRANSPOSE of an upper-triangular matrix (see loTriFactor's
   solveOp/fill-mode below), with separately scaled values in AALo.
   Only runs when the GPU copy is absent or stale.

   NOTE(review): A->data is cast both to Mat_SeqAIJ (for a->nz) and to
   Mat_SeqSBAIJ (for the i/j/a arrays) -- presumably the ICC factor is laid
   out sbaij-style; confirm against the host ICC factorization code.
*/
499 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
500 {
501   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
502   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
503   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
504   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
505   cusparseStatus_t                  stat;
506   PetscErrorCode                    ierr;
507   PetscInt                          *AiUp, *AjUp;
508   PetscScalar                       *AAUp;
509   PetscScalar                       *AALo;
510   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
511   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
512   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
513   const MatScalar                   *aa = b->a,*v;
514 
515   PetscFunctionBegin;
516   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
517     try {
518       /* Allocate Space for the upper triangular matrix */
      /* Pinned host buffers; AALo shares U's sparsity but holds the lower-solve values. */
519       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
520       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
521       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
522       ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
523 
524       /* Fill the upper triangular matrix */
525       AiUp[0]=(PetscInt) 0;
526       AiUp[n]=nzUpper;
527       offset = 0;
528       for (i=0; i<n; i++) {
529         /* set the pointers */
530         v  = aa + ai[i];
531         vj = aj + ai[i];
532         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
533 
534         /* first, set the diagonal elements */
535         AjUp[offset] = (PetscInt) i;
        /* host ICC stores the pivot such that 1/v[nz] is the diagonal used on the GPU */
536         AAUp[offset] = 1.0/v[nz];
537         AiUp[i]      = offset;
538         AALo[offset] = 1.0/v[nz];
539 
540         offset+=1;
541         if (nz>0) {
542           ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
543           ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* negate the off-diagonals for U; scale again by the pivot for the lower solve */
544           for (j=offset; j<offset+nz; j++) {
545             AAUp[j] = -AAUp[j];
546             AALo[j] = AAUp[j]/v[nz];
547           }
548           offset+=nz;
549         }
550       }
551 
552       /* allocate space for the triangular factor information */
553       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
554 
555       /* Create the matrix description */
556       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
557       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
558       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
559       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
560       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
561 
562       /* Create the solve analysis information */
563       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);
564 
565       /* set the operation */
566       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
567 
568       /* set the matrix */
569       upTriFactor->csrMat = new CsrMatrix;
570       upTriFactor->csrMat->num_rows = A->rmap->n;
571       upTriFactor->csrMat->num_cols = A->cmap->n;
572       upTriFactor->csrMat->num_entries = a->nz;
573 
574       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
575       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
576 
577       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
578       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
579 
580       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
581       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
582 
583       /* perform the solve analysis */
584       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
585                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
586                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
587                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);
588 
589       /* assign the pointer. Is this really necessary? */
590       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
591 
592       /* allocate space for the triangular factor information */
593       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
594 
595       /* Create the matrix description */
596       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
597       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
598       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      /* fill mode stays UPPER on purpose: the lower solve is done as the
         TRANSPOSE of this upper-pattern matrix (see solveOp below) */
599       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
600       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
601 
602       /* Create the solve analysis information */
603       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);
604 
605       /* set the operation */
606       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
607 
608       /* set the matrix */
609       loTriFactor->csrMat = new CsrMatrix;
610       loTriFactor->csrMat->num_rows = A->rmap->n;
611       loTriFactor->csrMat->num_cols = A->cmap->n;
612       loTriFactor->csrMat->num_entries = a->nz;
613 
614       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
615       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
616 
617       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
618       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
619 
620       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
621       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
622 
623       /* perform the solve analysis */
624       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
625                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
626                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
627                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);
628 
629       /* assign the pointer. Is this really necessary? */
630       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
631 
632       A->valid_GPU_matrix = PETSC_CUSP_BOTH;
633       ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
634       ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
635       ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
636       ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    /* NOTE(review): thrust usually throws std::exception-derived types, not
       char*; verify this catch clause matches the errors actually thrown. */
637     } catch(char *ex) {
638       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
639     }
640   }
641   PetscFunctionReturn(0);
642 }
643 
644 #undef __FUNCT__
645 #define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - builds the GPU triangular
   factors of a Cholesky/ICC factorization, allocates the inter-solve work
   vector, and -- for a non-identity ordering -- uploads the single symmetric
   permutation to both the row and column permutation arrays.
*/
646 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
647 {
648   PetscErrorCode               ierr;
649   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
650   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
651   IS                           ip = a->row;
652   const PetscInt               *rip;
653   PetscBool                    perm_identity;
654   PetscInt                     n = A->rmap->n;
655 
656   PetscFunctionBegin;
657   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
658   cusparseTriFactors->workVector = new THRUSTARRAY;
659   cusparseTriFactors->workVector->resize(n);
  /* total nnz of both factors: the off-diagonal entries are stored once but
     used in both triangular solves, the n diagonal entries only once */
660   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
661 
662   /*lower triangular indices */
663   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
664   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
665   if (!perm_identity) {
    /* symmetric ordering: the same index set serves as row and column permutation */
666     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
667     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
668     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
669     cusparseTriFactors->cpermIndices->assign(rip, rip+n);
670   }
671   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
672   PetscFunctionReturn(0);
673 }
674 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   MatLUFactorNumeric_SeqAIJCUSPARSE - performs the numeric LU factorization
   on the CPU via the SeqAIJ implementation, selects the solve kernels based
   on the orderings (the natural-ordering fast path avoids permutation work),
   and then copies/analyzes the triangular factors on the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side numeric factorization */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* pick the solve kernels: any non-identity ordering forces the permuting versions */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!row_identity || !col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }

  /* build, upload, and analyze the triangular factors on the GPU */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
701 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric Cholesky factorization: the factorization itself runs on the CPU
   via MatCholeskyFactorNumeric_SeqAIJ; this wrapper then selects the GPU
   solve routines and copies the triangular factors to the device.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* CPU numeric factorization */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* only one permutation (symmetric factorization); if it is the identity
     use the cheaper natural-ordering solves */
  ierr = ISIdentity(fact->row,&natural);CHKERRQ(ierr);
  if (natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* copy the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
728 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/*
   Builds, on the GPU, the transposes (via csr2csc) of the lower and upper
   triangular factors and runs the cusparse solve analysis on them.  This
   lets MatSolveTranspose use CUSPARSE_OPERATION_NON_TRANSPOSE solves on the
   explicitly transposed factors.  Results are stored in
   loTriFactorPtrTranspose/upTriFactorPtrTranspose of A->spptr.

   Note: previously loTriFactorT/upTriFactorT were initialized from the
   (still-unset) transpose pointers in A->spptr and then immediately
   overwritten by `new` — dead reads that have been removed; the final
   assignments now go through cusparseTriFactors directly instead of a
   redundant re-cast of A->spptr.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;  /* allocated below */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;  /* allocated below */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor settings of the lower triangular factor, flipping
     the fill mode: the transpose of a lower triangular matrix is upper
     triangular (and vice versa); type, index base, and diag type carry over */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* the whole point: a non-transposed solve on the transposed factor */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor (the
     factors are square, so the dimensions carry over unchanged) */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* publish the transposed lower factor */
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor settings of the upper triangular factor, again
     flipping the fill mode */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* publish the transposed upper factor */
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
861 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/*
   Builds the explicit transpose of A on the GPU for use by
   MatMultTranspose.  The transpose of an (rmap->n x cmap->n) matrix is
   (cmap->n x rmap->n), so the dimensions of the transposed CSR structures
   are swapped relative to A.

   Bug fixes relative to the previous version: (1) the transposed CSR
   matrices copied A's dimensions unswapped, so for rectangular matrices
   with cmap->n > rmap->n the row_offsets array (which csr2csc fills with
   num_cols+1 entries) was under-allocated; csr2hyb was likewise called with
   A's rather than the transpose's dimensions.  (2) matstructT was
   pointlessly initialized from the still-unset matTranspose pointer before
   being overwritten by `new`.  Behavior is unchanged for square matrices.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;  /* allocated below */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the transpose information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

  /* set alpha and beta on the device (device pointer mode is used below) */
  err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* dimensions of A^T: swapped relative to A */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* the CSC of A is exactly the CSR of A^T */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR; this holds A itself, so A's dimensions */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSP(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose); swapped dims */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->cmap->n;
    tempT->num_cols = A->rmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* Last, convert the CSR of A^T to HYB; pass the transpose's dimensions */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* publish the transpose */
  cusparsestruct->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
990 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/*
   Solves A^T x = b using the transposed triangular factors (built lazily on
   first use).  With A = P^T L U Q^T (rows/columns reordered by the
   factorization), A^T x = b requires U^T then L^T solves, with the COLUMN
   permutation applied to the right-hand side and the ROW permutation to the
   solution — the reverse of MatSolve.

   Bug fix: the previous version applied rpermIndices to b and cpermIndices
   to x (same order as the forward solve), which is only correct when the
   two permutations coincide (e.g. ICC); for LU with distinct row/column
   orderings the result was wrong.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU, *bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder with the column permutation (reverse of MatSolve) */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->cpermIndices->end()),
               xGPU->begin());

  /* First, solve U^T (the transposed upper factor, non-transposed op) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve L^T */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, reorder with the row permutation into a temporary ... the
     permuted gather can't be done in place */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1054 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/*
   Solves A^T x = b in the natural ordering (no permutations needed).  Uses
   the explicitly transposed factors, which are built lazily on first use,
   so that both triangular solves run with the non-transpose operation.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xarray,*barray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* lazily build and analyze the transposed factors */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* device pointers for the vectors */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* solve with the transposed upper factor first */
  stat = cusparse_solve(fs->handle, upT->solveOp,
                        upT->csrMat->num_rows, &ALPHA, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        barray->data().get(), work->data().get());CHKERRCUSP(stat);

  /* then with the transposed lower factor */
  stat = cusparse_solve(fs->handle, loT->solveOp,
                        loT->csrMat->num_rows, &ALPHA, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), xarray->data().get());CHKERRCUSP(stat);

  /* restore the vectors and wait for the GPU to finish */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1104 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   Solves A x = b with the triangular factors on the GPU: gathers b through
   the row permutation, solves L then U, then gathers the result through the
   column permutation.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xarray,*barray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* device pointers for the vectors */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* gather b through the row permutation into x */
  thrust::copy(thrust::make_permutation_iterator(barray->begin(), fs->rpermIndices->begin()),
               thrust::make_permutation_iterator(barray->end(),   fs->rpermIndices->end()),
               xarray->begin());

  /* forward solve with the lower factor */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows, &ALPHA, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        xarray->data().get(), work->data().get());CHKERRCUSP(stat);

  /* backward solve with the upper factor */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows, &ALPHA, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), xarray->data().get());CHKERRCUSP(stat);

  /* gather the solution through the column permutation; the permuted gather
     can't be done in place, so stage it through the work vector */
  thrust::copy(thrust::make_permutation_iterator(xarray->begin(), fs->cpermIndices->begin()),
               thrust::make_permutation_iterator(xarray->end(),   fs->cpermIndices->end()),
               work->begin());
  thrust::copy(work->begin(), work->end(), xarray->begin());

  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1159 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   Solves A x = b in the natural ordering: no permutation gathers are
   needed, just the lower then upper triangular solves on the GPU.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xarray,*barray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* device pointers for the vectors */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* forward solve with the lower factor */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows, &ALPHA, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        barray->data().get(), work->data().get());CHKERRCUSP(stat);

  /* backward solve with the upper factor */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows, &ALPHA, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), xarray->data().get());CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1201 
1202 #undef __FUNCT__
1203 #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
1204 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1205 {
1206 
1207   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1208   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1209   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1210   PetscInt                     m = A->rmap->n,*ii,*ridx;
1211   PetscErrorCode               ierr;
1212   cusparseStatus_t             stat;
1213   cudaError_t                  err;
1214 
1215   PetscFunctionBegin;
1216   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
1217     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1218     if (matstruct) {
1219       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized.");
1220     }
1221     try {
1222       cusparsestruct->nonzerorow=0;
1223       for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1224 
1225       if (a->compressedrow.use) {
1226         m    = a->compressedrow.nrows;
1227         ii   = a->compressedrow.i;
1228         ridx = a->compressedrow.rindex;
1229       } else {
1230         /* Forcing compressed row on the GPU */
1231         int k=0;
1232         ierr = PetscMalloc((cusparsestruct->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
1233         ierr = PetscMalloc((cusparsestruct->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
1234         ii[0]=0;
1235         for (int j = 0; j<m; j++) {
1236           if ((a->i[j+1]-a->i[j])>0) {
1237             ii[k]  = a->i[j];
1238             ridx[k]= j;
1239             k++;
1240           }
1241         }
1242         ii[cusparsestruct->nonzerorow] = a->nz;
1243         m = cusparsestruct->nonzerorow;
1244       }
1245 
1246       /* allocate space for the triangular factor information */
1247       matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1248       stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
1249       stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
1250       stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);
1251 
1252       err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
1253       err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
1254       err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
1255       err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
1256       stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
1257 
1258       /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1259       if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1260         /* set the matrix */
1261 	CsrMatrix *matrix= new CsrMatrix;
1262 	matrix->num_rows = m;
1263 	matrix->num_cols = A->cmap->n;
1264 	matrix->num_entries = a->nz;
1265 	matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1266 	matrix->row_offsets->assign(ii, ii + m+1);
1267 
1268 	matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1269 	matrix->column_indices->assign(a->j, a->j+a->nz);
1270 
1271 	matrix->values = new THRUSTARRAY(a->nz);
1272 	matrix->values->assign(a->a, a->a+a->nz);
1273 
1274         /* assign the pointer */
1275         matstruct->mat = matrix;
1276 
1277       } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1278 #if CUDA_VERSION>=4020
1279 	CsrMatrix *matrix= new CsrMatrix;
1280 	matrix->num_rows = m;
1281 	matrix->num_cols = A->cmap->n;
1282 	matrix->num_entries = a->nz;
1283 	matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1284 	matrix->row_offsets->assign(ii, ii + m+1);
1285 
1286 	matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1287 	matrix->column_indices->assign(a->j, a->j+a->nz);
1288 
1289 	matrix->values = new THRUSTARRAY(a->nz);
1290 	matrix->values->assign(a->a, a->a+a->nz);
1291 
1292         cusparseHybMat_t hybMat;
1293         stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
1294         cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1295           CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1296         stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1297                                 matstruct->descr, matrix->values->data().get(),
1298                                 matrix->row_offsets->data().get(),
1299                                 matrix->column_indices->data().get(),
1300                                 hybMat, 0, partition);CHKERRCUSP(stat);
1301         /* assign the pointer */
1302         matstruct->mat = hybMat;
1303 
1304 	if (matrix) {
1305 	  if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1306 	  if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1307 	  if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1308 	  delete (CsrMatrix*)matrix;
1309 	}
1310 #endif
1311       }
1312 
1313       /* assign the compressed row indices */
1314       matstruct->cprowIndices = new THRUSTINTARRAY(m);
1315       matstruct->cprowIndices->assign(ridx,ridx+m);
1316 
1317       /* assign the pointer */
1318       cusparsestruct->mat = matstruct;
1319 
1320       if (!a->compressedrow.use) {
1321         ierr = PetscFree(ii);CHKERRQ(ierr);
1322         ierr = PetscFree(ridx);CHKERRQ(ierr);
1323       }
1324       cusparsestruct->workVector = new THRUSTARRAY;
1325       cusparsestruct->workVector->resize(m);
1326     } catch(char *ex) {
1327       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1328     }
1329     ierr = WaitForGPU();CHKERRCUSP(ierr);
1330 
1331     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
1332 
1333     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1334   }
1335   PetscFunctionReturn(0);
1336 }
1337 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/*
   Creates GPU (VECSEQCUSP) vectors conforming to the matrix layouts:
   right conforms to the column layout (the x in y = A*x), left conforms
   to the row layout (the y).  Either output pointer may be NULL.
*/
static PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* the right vector lives in the column space, so it must take the COLUMN
       block size; the previous code used mat->rmap->bs, which is wrong for
       rectangular matrices whose row and column block sizes differ */
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1361 
/* Functor for thrust::for_each over a zip iterator: accumulates the first
   tuple component into the second (destination += source).  Used to scatter
   the compressed-row SpMV result back into a full-length vector. */
struct VecCUSPPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1371 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   yy = A*xx on the GPU via cusparse SpMV.  The device copy of A is assumed
   current: MatSeqAIJCUSPARSECopyToGPU() is invoked from
   MatAssemblyEnd_SeqAIJCUSPARSE, so no copy is done here.
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *mult = (Mat_SeqAIJCUSPARSEMultStruct*)cusp->mat;
  CUSPARRAY                    *xGPU,*yGPU;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  if (cusp->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *csr = (CsrMatrix*)mult->mat;
    stat = cusparse_csr_spmv(cusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             csr->num_rows, csr->num_cols, csr->num_entries,
                             mult->alpha, mult->descr, csr->values->data().get(), csr->row_offsets->data().get(),
                             csr->column_indices->data().get(), xGPU->data().get(), mult->beta,
                             yGPU->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL and HYB formats are both stored in a cusparseHybMat_t; only the
       partitioning chosen at csr2hyb time differs */
    cusparseHybMat_t hyb = (cusparseHybMat_t)mult->mat;
    stat = cusparse_hyb_spmv(cusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mult->alpha, mult->descr, hyb,
                             xGPU->data().get(), mult->beta,
                             yGPU->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* if the user attached a stream, leave the work asynchronous on it */
  if (!cusp->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*aij->nz - cusp->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1412 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   yy = A^T*xx on the GPU.  Implemented as a NON_TRANSPOSE SpMV applied to an
   explicitly stored transpose of A, which is generated lazily on first use
   and cached in cusparsestruct->matTranspose.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusp  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *multT = (Mat_SeqAIJCUSPARSEMultStruct*)cusp->matTranspose;
  CUSPARRAY                    *xGPU,*yGPU;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* the device copy of A is refreshed by MatAssemblyEnd_SeqAIJCUSPARSE */
  if (!multT) {
    /* build the explicit transpose once; reused by subsequent calls */
    ierr  = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    multT = (Mat_SeqAIJCUSPARSEMultStruct*)cusp->matTranspose;
  }
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);

  if (cusp->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *csrT = (CsrMatrix*)multT->mat;
    stat = cusparse_csr_spmv(cusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             csrT->num_rows, csrT->num_cols,
                             csrT->num_entries, multT->alpha, multT->descr,
                             csrT->values->data().get(), csrT->row_offsets->data().get(),
                             csrT->column_indices->data().get(), xGPU->data().get(), multT->beta,
                             yGPU->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybT = (cusparseHybMat_t)multT->mat;
    stat = cusparse_hyb_spmv(cusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             multT->alpha, multT->descr, hybT,
                             xGPU->data().get(), multT->beta,
                             yGPU->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* if the user attached a stream, leave the work asynchronous on it */
  if (!cusp->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*aij->nz - cusp->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1459 
1460 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   zz = yy + A*xx on the GPU.

   The product A*xx is computed into cusparsestruct->workVector, which holds
   one entry per (compressed) nonzero row of A, and is then added into zz by
   a permuted scatter driven by matstruct->cprowIndices.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    /* initialize zz = yy; the A*xx contribution is accumulated below */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    /* NOTE(review): yarray is acquired and restored but never read in this
       routine; VecCopy_SeqCUSP() above already consumed yy.  Presumably the
       get/restore pair is kept for its coherence bookkeeping -- confirm */
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      /* skip the SpMV entirely when there are no nonzero rows */
      if (cusparsestruct->workVector->size()) {
	stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
				 matstruct->alpha, matstruct->descr, hybMat,
				 xarray->data().get(), matstruct->beta,
				 cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
		     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
		     VecCUSPPlusEquals());
    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1520 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   zz = yy + A^T*xx on the GPU.

   Uses the explicitly stored transpose of A (generated lazily on first use)
   with a NON_TRANSPOSE SpMV.  The product is formed in
   cusparsestruct->workVector and added into zz via the permutation in
   matstructT->cprowIndices.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  /* generate and cache the explicit transpose on first use */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* initialize zz = yy; the A^T*xx contribution is accumulated below */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    /* NOTE(review): yarray is acquired and restored but never read here;
       VecCopy_SeqCUSP() above already consumed yy -- confirm the pair is
       only kept for coherence bookkeeping */
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
			       mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      /* skip the SpMV entirely when there are no nonzero rows */
      if (cusparsestruct->workVector->size()) {
	stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
				 matstructT->alpha, matstructT->descr, hybMat,
				 xarray->data().get(), matstructT->beta,
				 cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))),
		     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
		     VecCUSPPlusEquals());

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1586 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
   Completes assembly on the host, mirrors the matrix to the GPU (unfactored
   matrices only), and installs the GPU multiply kernels once assembly is
   final (not on a flush assembly).
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* factored matrices keep their data in the triangular-factor structures,
     so only unfactored matrices are copied to the device here */
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode != MAT_FLUSH_ASSEMBLY) {
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1605 
1606 /* --------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local sizes and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* the cast drops const to match the preallocation routine's signature;
     the array is not modified */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1667 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   Destroys the GPU-side data hanging off A->spptr, then delegates to
   MatDestroy_SeqAIJ() for the host-side data.

   A->spptr has one of two layouts:
     - MAT_FACTOR_NONE:     a Mat_SeqAIJCUSPARSE holding the SpMV structures
       (plain and transpose), the work vector and the cusparse handle;
     - a factored matrix:   a Mat_SeqAIJCUSPARSETriFactors holding the four
       triangular-factor structures, the permutations and the work vector.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    try {
      /* only tear down the device structures if they were ever allocated */
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
        Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
        Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;

        /* delete all the members in each of the data structures */
        if (matstruct) {
          if (matstruct->descr) { stat = cusparseDestroyMatDescr(matstruct->descr);CHKERRCUSP(stat); }
          if (matstruct->mat) {
            /* ELL and HYB storage share the cusparseHybMat_t representation */
            if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
              cusparseHybMat_t hybMat = (cusparseHybMat_t) matstruct->mat;
              stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
#endif
            } else {
	      CsrMatrix* mat = (CsrMatrix*)matstruct->mat;
	      if (mat->values) delete (THRUSTARRAY*)mat->values;
	      if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
	      if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
              delete (CsrMatrix*)matstruct->mat;
            }
          }
          /* alpha/beta are device-side scalars allocated with cudaMalloc */
	  if (matstruct->alpha) { err=cudaFree(matstruct->alpha);CHKERRCUSP(err); }
	  if (matstruct->beta) { err=cudaFree(matstruct->beta);CHKERRCUSP(err); }
          if (matstruct->cprowIndices) delete (THRUSTINTARRAY*)matstruct->cprowIndices;
          /* NOTE(review): inner "if (matstruct)" is redundant inside this block */
          if (matstruct) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstruct;
        }
        /* the cached transpose (if it was ever generated) */
        if (matstructT) {
          if (matstructT->descr) { stat = cusparseDestroyMatDescr(matstructT->descr);CHKERRCUSP(stat); }
          if (matstructT->mat) {
            if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
              cusparseHybMat_t hybMat = (cusparseHybMat_t) matstructT->mat;
              stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
#endif
            } else {
	      CsrMatrix* matT = (CsrMatrix*)matstructT->mat;
	      if (matT->values) delete (THRUSTARRAY*)matT->values;
	      if (matT->column_indices) delete (THRUSTINTARRAY32*)matT->column_indices;
	      if (matT->row_offsets) delete (THRUSTINTARRAY32*)matT->row_offsets;
              delete (CsrMatrix*)matstructT->mat;
            }
          }
	  if (matstructT->alpha) { err=cudaFree(matstructT->alpha);CHKERRCUSP(err); }
	  if (matstructT->beta) { err=cudaFree(matstructT->beta);CHKERRCUSP(err); }
          if (matstructT->cprowIndices) delete (THRUSTINTARRAY*)matstructT->cprowIndices;
          /* NOTE(review): inner "if (matstructT)" is redundant inside this block */
          if (matstructT) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstructT;
        }
        if (cusparsestruct->workVector) delete (THRUSTARRAY*)cusparsestruct->workVector;
        if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
        if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
        A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
      } else {
        /* nothing was copied to the GPU: only the handle and container exist */
        Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
        if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
        if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
      Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
      Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
      Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;

      /* delete all the members in each of the data structures */
      /* lower-triangular factor */
      if (loTriFactor) {
        if (loTriFactor->descr) { stat = cusparseDestroyMatDescr(loTriFactor->descr);CHKERRCUSP(stat); }
        if (loTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactor->solveInfo);CHKERRCUSP(stat); }
        if (loTriFactor->csrMat) {
	  if (loTriFactor->csrMat->values) delete (THRUSTARRAY*)loTriFactor->csrMat->values;
	  if (loTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->column_indices;
	  if (loTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->row_offsets;
	  delete (CsrMatrix*)loTriFactor->csrMat;
	}
        if (loTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactor;
      }

      /* upper-triangular factor */
      if (upTriFactor) {
        if (upTriFactor->descr) { stat = cusparseDestroyMatDescr(upTriFactor->descr);CHKERRCUSP(stat); }
        if (upTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactor->solveInfo);CHKERRCUSP(stat); }
        if (upTriFactor->csrMat) {
	  if (upTriFactor->csrMat->values) delete (THRUSTARRAY*)upTriFactor->csrMat->values;
	  if (upTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->column_indices;
	  if (upTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->row_offsets;
	  delete (CsrMatrix*)upTriFactor->csrMat;
	}
        if (upTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactor;
      }

      /* transpose of the lower-triangular factor (for MatSolveTranspose) */
      if (loTriFactorT) {
        if (loTriFactorT->descr) { stat = cusparseDestroyMatDescr(loTriFactorT->descr);CHKERRCUSP(stat); }
        if (loTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactorT->solveInfo);CHKERRCUSP(stat); }
        if (loTriFactorT->csrMat) {
	  if (loTriFactorT->csrMat->values) delete (THRUSTARRAY*)loTriFactorT->csrMat->values;
	  if (loTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->column_indices;
	  if (loTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->row_offsets;
	  delete (CsrMatrix*)loTriFactorT->csrMat;
	}
        if (loTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactorT;
      }

      /* transpose of the upper-triangular factor (for MatSolveTranspose) */
      if (upTriFactorT) {
        if (upTriFactorT->descr) { stat = cusparseDestroyMatDescr(upTriFactorT->descr);CHKERRCUSP(stat); }
        if (upTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactorT->solveInfo);CHKERRCUSP(stat); }
        if (upTriFactorT->csrMat) {
	  if (upTriFactorT->csrMat->values) delete (THRUSTARRAY*)upTriFactorT->csrMat->values;
	  if (upTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->column_indices;
	  if (upTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->row_offsets;
	  delete (CsrMatrix*)upTriFactorT->csrMat;
	}
        if (upTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactorT;
      }
      /* ordering permutations and scratch space */
      if (cusparseTriFactors->rpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->rpermIndices;
      if (cusparseTriFactors->cpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->cpermIndices;
      if (cusparseTriFactors->workVector) delete (THRUSTARRAY*)cusparseTriFactors->workVector;
      if (cusparseTriFactors->handle) { stat = cusparseDestroy(cusparseTriFactors->handle);CHKERRCUSP(stat); }
      if (cusparseTriFactors) delete (Mat_SeqAIJCUSPARSETriFactors*)cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1810 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   Constructor for MATSEQAIJCUSPARSE: builds the host AIJ matrix, attaches
   the GPU data container to B->spptr (a Mat_SeqAIJCUSPARSE for ordinary
   matrices, a Mat_SeqAIJCUSPARSETriFactors for factored ones), creates the
   cusparse handle, and installs the type-specific operations.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure.  (Each field is initialized exactly
       once; the original code redundantly assigned stream and handle twice.) */
    Mat_SeqAIJCUSPARSE *cusp = new Mat_SeqAIJCUSPARSE;
    B->spptr = cusp;            /* attach before any CHKERRCUSP so teardown can find it */
    cusp->mat          = 0;
    cusp->matTranspose = 0;
    cusp->workVector   = 0;
    cusp->format       = MAT_CUSPARSE_CSR;
    cusp->stream       = 0;
    cusp->handle       = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    cusp->handle       = handle;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;
    B->spptr = tri;             /* attach before any CHKERRCUSP so teardown can find it */
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    tri->handle                  = 0;
    tri->nnz                     = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    tri->handle                  = handle;
  }

  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* nothing has been copied to the device yet */
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1871 
1872 /*M
1873    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1874 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1876    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1877    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1878 
1879    Options Database Keys:
1880 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1881 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1883 
1884   Level: beginner
1885 
1886 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1887 M*/
1888