xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 51c6d536e9ff46f413aadec1bc7f8e85deed0c81)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* Printable names for MatCUSPARSEStorageFormat, consumed by PetscOptionsEnum();
   the trailing entries ("MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0) follow the
   PETSc enum-string convention of enum type name, option prefix, and terminator */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 
19 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
20 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
21 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
22 
23 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
24 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
26 
27 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
31 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
32 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
35 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
36 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
37 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
38 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
39 
40 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
41 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
42 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
43 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
44 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
45 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
46 
/*
   MatCUSPARSESetStream - records the given CUDA stream in the matrix' GPU data
   and installs it on the matrix' cusparse handle, so subsequent cusparse calls
   for this matrix are issued on that stream.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   status;

  PetscFunctionBegin;
  cusp->stream = stream;
  status = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(status);
  PetscFunctionReturn(0);
}
57 
/*
   MatCUSPARSESetHandle - installs the given cusparse handle on the matrix,
   destroying any previously held (different) handle, and switches the handle
   to device pointer mode.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   status;

  PetscFunctionBegin;
  if (cusp->handle != handle) {
    /* release the old handle before adopting the caller's */
    if (cusp->handle) {
      status = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(status);
    }
    cusp->handle = handle;
  }
  /* scalar arguments to cusparse calls (alpha/beta) will live in device memory */
  status = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(status);
  PetscFunctionReturn(0);
}
73 
/*
   MatCUSPARSEClearHandle - forgets (without destroying) the cusparse handle held
   by the matrix. Used when the handle is owned elsewhere and must not be freed
   by this matrix' destructor.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* A->spptr may be NULL if the GPU data was never set up or was already
     destroyed; there is nothing to clear in that case */
  if (!cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
82 
/* Query callback composed on factor matrices: reports MATSOLVERCUSPARSE as the solver package */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
89 
90 /*MC
91   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
92   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
93   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
94   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
95   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
96   algorithms are not recommended. This class does NOT support direct solver operations.
97 
98   Level: beginner
99 
100 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
101 M*/
102 
/*
   MatGetFactor_seqaijcusparse_cusparse - creates the (empty) factor matrix B for
   a SEQAIJCUSPARSE matrix A and installs the symbolic-factorization routines
   appropriate for the requested factor type; numeric routines are installed
   later by the symbolic stage.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* the actual nonzero structure is allocated by the symbolic factorization */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
127 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - type-specific implementation behind
   MatCUSPARSESetFormat(). For sequential matrices MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL both set the single stored format; other operations are
   rejected.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT || op == MAT_CUSPARSE_ALL) {
    cusp->format = format;
  } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  PetscFunctionReturn(0);
}
145 
146 /*@
147    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
148    operation. Only the MatMult operation can use different GPU storage formats
149    for MPIAIJCUSPARSE matrices.
150    Not Collective
151 
152    Input Parameters:
153 +  A - Matrix of type SEQAIJCUSPARSE
154 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
155 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
156 
157    Output Parameter:
158 
159    Level: intermediate
160 
161 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
162 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific routine if composed on A; silently a no-op otherwise */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
172 
173 /*@
174    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
175 
176    Collective on mat
177 
178    Input Parameters:
179 +  A - Matrix of type SEQAIJCUSPARSE
180 -  transgen - the boolean flag
181 
182    Level: intermediate
183 
184 .seealso: MATSEQAIJCUSPARSE
185 @*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      isseqaijcusparse;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&isseqaijcusparse);CHKERRQ(ierr);
  /* silently ignore matrices of any other type */
  if (!isseqaijcusparse) PetscFunctionReturn(0);
  if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->transgen = transgen;
  PetscFunctionReturn(0);
}
201 
/* Process runtime options for SeqAIJCUSPARSE matrices: explicit-transpose
   generation and the GPU storage format for SpMV and/or triangular solves. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* these options only make sense for a non-factored matrix */
    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",cusparsestruct->transgen,&cusparsestruct->transgen,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  /* processed for factored matrices too; NOTE(review): for a factored matrix
     A->spptr holds triangular-factor data, so reading cusparsestruct->format as
     the default here assumes layout compatibility -- confirm against
     Mat_SeqAIJCUSPARSETriFactors in cusparsematimpl.h */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
227 
/* Symbolic ILU: delegate to the host SeqAIJ implementation, then install the
   CUSPARSE numeric routine so the numeric stage takes the GPU path */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
237 
/* Symbolic LU: delegate to the host SeqAIJ implementation, then install the
   CUSPARSE numeric routine so the numeric stage takes the GPU path */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
247 
/* Symbolic ICC: delegate to the host SeqAIJ implementation, then install the
   CUSPARSE Cholesky numeric routine for the GPU path */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
257 
/* Symbolic Cholesky: delegate to the host SeqAIJ implementation, then install
   the CUSPARSE Cholesky numeric routine for the GPU path */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
267 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - assembles the unit-diagonal lower
   triangular factor L of a host-computed ILU factorization as a CSR matrix on
   the GPU and runs the cusparse triangular-solve analysis on it.

   Only acts when the data currently lives on the CPU (or is unallocated);
   otherwise the GPU copy is assumed up to date. Host staging buffers are
   allocated pinned (cudaMallocHost) and freed after the device copy.

   NOTE(review): this reads a->i/a->j/a->a assuming the factored-SeqAIJ layout
   where each row's strictly-lower entries are stored first (so nzLower =
   n + ai[n] - ai[1] adds one unit diagonal per row) -- confirm against the
   SeqAIJ ILU storage convention in aijfact.c.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix; row 0 holds only the unit diagonal */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: lower triangular with an implicit unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: the thrust assign()s below copy the pinned host buffers to the device */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
372 
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - assembles the upper triangular
   factor U of a host-computed ILU factorization as a CSR matrix on the GPU
   (diagonal stored as its reciprocal, non-unit diagonal) and runs the cusparse
   triangular-solve analysis on it.

   Only acts when the data currently lives on the CPU (or is unallocated).

   NOTE(review): this walks a->diag assuming the factored-SeqAIJ convention
   where U is stored via adiag[] with rows addressed back-to-front (hence the
   i=n-1..0 loop and nzUpper = adiag[0]-adiag[n]) -- confirm against the SeqAIJ
   ILU storage convention in aijfact.c.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements: stored as 1/d so the solve multiplies
           rather than divides */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper triangular, explicit (non-unit) diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: the thrust assign()s below copy the pinned host buffers to the device */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
473 
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - rebuilds both ILU triangular
   factors on the GPU, allocates the solve work vector, and uploads the row/
   column permutations (only when they are not the identity, to skip the
   gather/scatter in the solve path).
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* drop any stale GPU factor data before rebuilding */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* temporary device vector used between the lower and upper solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log only the permutation data actually transferred above */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
521 
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - builds both triangular factors of a
   host-computed ICC factorization on the GPU. Only the upper factor U is
   assembled in CSR form; the lower solve reuses the same structure with a
   transpose operation (so L = U^T is never stored explicitly), with separately
   scaled values in AALo.

   Only acts when the data currently lives on the CPU (or is unallocated).

   NOTE(review): A->data is cast both to Mat_SeqAIJ (for a->nz) and to
   Mat_SeqSBAIJ (for the i/j/a arrays); this relies on the ICC factor being
   stored with an SBAIJ-compatible layout -- confirm against the ICC
   factorization code in sbaijfact.c/aijfact.c.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host staging buffers;
         AALo holds the separately scaled values used for the transpose/lower solve) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements (v[nz] is the diagonal entry of row i,
           stored as its reciprocal) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* negate the off-diagonals; the lower-factor values get an extra 1/diag scaling */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper triangular with implicit unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: thrust assign()s copy the pinned host buffers to the device */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: the lower factor is represented as the
         transpose of the stored upper structure, hence UPPER fill mode here
         combined with the TRANSPOSE solve operation below */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same sparsity as the upper factor, values from AALo */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
669 
/* Rebuild the GPU triangular-factor data for an ICC factor of A and cache the
   row/column permutations on the device.  Resets any previous GPU factor
   state, rebuilds the (upper-storage) ICC triangular matrices on the GPU, and
   allocates the device work vector used by the triangular solves. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;           /* factorization row permutation */
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* drop any stale GPU factor data before rebuilding (the struct itself survives the reset) */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* scratch vector shared by the two triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* total nonzeros across both triangular factors: the ICC factor stores only
     the upper triangle (a->nz = diag + strict upper), so L and U together hold
     2*(a->nz - n) off-diagonal entries plus n diagonal entries */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    /* non-trivial ordering: upload the permutation (rperm) and its inverse
       (cperm) so MatSolve can gather/scatter on the device */
    IS             iip;
    const PetscInt *irip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
706 
/* Numeric LU factorization for SeqAIJCUSPARSE: the factorization itself runs
   on the CPU (MatLUFactorNumeric_SeqAIJ); afterwards the solve callbacks are
   selected and the factors are copied to the GPU with their cuSPARSE solve
   analysis.  Natural (identity) row AND column orderings use the faster solve
   path that skips the device permutations. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side numeric factorization; GPU copy happens below */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* choose the MatSolve flavour based on the orderings */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no GPU MatMatSolve implementation in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* copy the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
736 
/* Numeric Cholesky/ICC factorization for SeqAIJCUSPARSE: delegates the
   factorization to the CPU kernel, then installs the GPU solve callbacks and
   uploads the factors.  An identity permutation selects the natural-ordering
   solves, which avoid the device-side gather/scatter. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization itself is carried out on the CPU */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* select the solve implementation from the ordering */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* matsolve variants are not provided on the GPU */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* copy the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
765 
/* Build the transposed triangular factors (as explicit CSC/CSR transposes) so
   MatSolveTranspose can run NON_TRANSPOSE cuSPARSE solves against them, and
   perform the cuSPARSE solve analysis on each.

   Preconditions: cusparseTriFactors->{lo,up}TriFactorPtr exist (the forward
   factors are already on the GPU) and the transpose pointers have not been
   built yet — callers only invoke this when loTriFactorPtrTranspose and
   upTriFactorPtrTranspose are NULL, so assigning fresh structs below cannot
   leak.  (The previous version read those pointers into locals and then
   immediately overwrote the locals with `new`, which were dead stores.) */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* mirror the forward factor's descriptor, flipping the fill mode since the
     transpose of a lower factor is upper (and vice versa) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* the transpose is stored explicitly, so the solve itself is non-transposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* publish the transposed lower factor */
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* mirror the forward factor's descriptor, flipping the fill mode */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* the transpose is stored explicitly, so the solve itself is non-transposed */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* publish the transposed upper factor */
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
896 
/* Build cusparsestruct->matTranspose, the explicit transpose of A used by
   MatMultTranspose, in the currently selected storage format (CSR, ELL or
   HYB).  No-op unless transpose generation was requested (transgen).

   Fixes relative to the previous version:
   - the HYB/ELL branch sized the transposed matrix tempT and the final
     csr2hyb call with A's own dimensions (rmap x cmap); the transpose has the
     swapped dimensions (cmap x rmap).  Harmless for square matrices, wrong
     for rectangular ones.  The CSR branch already did this correctly.
   - matstructT was pointlessly initialized from cusparsestruct->matTranspose
     and immediately overwritten; callers only invoke this when matTranspose
     is not yet built. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (!cusparsestruct->transgen) PetscFunctionReturn(0);
  /* allocate space for the transpose information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalars for the SpMV calls */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* the transpose has A's dimensions swapped */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* NOTE(review): any previously allocated rowoffsets_gpu is overwritten
       here without a delete — verify callers never reach this twice */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    CsrMatrix *temp = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;

    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose); the transpose
       has A's dimensions swapped (cmap x rmap) */
    tempT->num_rows = A->cmap->n;
    tempT->num_cols = A->rmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB, again with the transposed dimensions */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1019 
1020 /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b on the GPU for a permuted factorization:
     x = P_c^T U^{-T} L^{-T} P_r b   (solved as U^T first, then L^T),
   using explicitly stored transposes of the factors so each triangular solve
   runs in NON_TRANSPOSE mode.  The transposed factors are built lazily on
   first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, gather b through the row permutation into x: x[i] = b[rperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U^T (stored explicitly, so the op is non-transpose): x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L^T: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* ~2 flops per factor nonzero, minus the n diagonal divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1090 
/* Solve A^T x = b on the GPU when the factorization uses the natural
   ordering, so no device-side permutations are needed.  Since
   A^T = (LU)^T = U^T L^T, the solve runs U^T first and then L^T, each against
   an explicitly stored transpose (built lazily on first use) so cuSPARSE can
   operate in NON_TRANSPOSE mode. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* build the transposed factors lazily the first time a transpose solve runs */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* device pointers for the right-hand side and the solution */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* step 1: solve with U^T, b -> work */
  stat = cusparse_solve(fs->handle, upT->solveOp,
                        upT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        b, work->data().get());CHKERRCUSPARSE(stat);

  /* step 2: solve with L^T, work -> x */
  stat = cusparse_solve(fs->handle, loT->solveOp,
                        loT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), x);CHKERRCUSPARSE(stat);

  /* hand the arrays back and account for the work */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1142 
/* Solve A x = b on the GPU for a permuted factorization:
     x = P_c^T U^{-1} L^{-1} P_r b,
   gathering b through the row permutation, running the two cuSPARSE
   triangular solves, then scattering through the column permutation. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, gather b through the row permutation: tempGPU[i] = b[rperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Then, solve U: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Last, gather the intermediate through the column permutation into x:
     x[i] = tempGPU[cperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* ~2 flops per factor nonzero, minus the n diagonal divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1201 
/* Solve A x = b on the GPU when the factorization uses the natural ordering:
   no permutations are required, so the solve is simply a forward sweep with L
   followed by a backward sweep with U. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* device pointers for the right-hand side and the solution */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* forward solve with L: b -> work */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        b, work->data().get());CHKERRCUSPARSE(stat);

  /* backward solve with U: work -> x */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows, &PETSC_CUSPARSE_ONE, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), x);CHKERRCUSPARSE(stat);

  /* hand the arrays back and account for the work */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1245 
/* Mirrors the host AIJ matrix onto the GPU when the device copy is absent or
   stale.  If the matrix was assembled before with the same nonzero state and
   the CSR format is in use, only the numerical values are re-copied (and the
   cached transpose, if previously built, is refreshed); otherwise all device
   structures are discarded and rebuilt from the host data. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Same nonzero pattern as the existing device copy: copy values only */
      CsrMatrix *mat,*matT;
      mat  = (CsrMatrix*)cusparsestruct->mat->mat;
      mat->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        /* numeric-only csr2csc refreshes the transposed values in place */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                 A->cmap->n, mat->num_entries,
                                 mat->values->data().get(),
                                 cusparsestruct->rowoffsets_gpu->data().get(),
                                 mat->column_indices->data().get(),
                                 matT->values->data().get(),
                                 matT->column_indices->data().get(),
                                 matT->row_offsets->data().get(),
                                 CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
      }
    } else {
      /* Nonzero pattern changed (or non-CSR storage): drop the old device
         structures and rebuild everything from the host arrays */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        if (a->compressedrow.use) {
          /* compressed-row storage: only rows with nonzeros are represented */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        cusparsestruct->nrows = m;

        /* allocate the SpMV multiply structure and its matrix descriptor */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants 1 and 0, used with CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* build a temporary device CSR copy, convert it to HYB/ELL, then free the CSR copy */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices (used by the SpMV kernel to
           scatter/gather between the compressed and the full vector) */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    err  = WaitForGPU();CHKERRCUDA(err);
    /* host and device copies now agree */
    A->offloadmask = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1391 
/* Thrust functor: accumulates the first tuple element into the second,
   i.e. get<1> += get<0>.  Used with a zip of (work vector, permuted full
   vector) to scatter-add compressed SpMV results into the full vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1401 
/* Thrust functor: assigns the second tuple element to the first,
   i.e. get<0> = get<1>.  Used with a zip of (work vector, permuted full
   vector) to gather selected entries of a full vector into the compressed
   work vector. */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
1411 
/* Per-product context attached to C->product->data for
   AIJCUSPARSE (sparse) times DENSECUDA (dense) products */
typedef struct {
  PetscBool   cisdense; /* caller supplied C as MATSEQDENSE (CPU); result is converted back after the numeric phase */
  PetscScalar *Bt;      /* device buffer holding an explicit transpose of B, for products where cusparse cannot transpose B (ABt, RARt) */
  Mat         X;        /* intermediate dense product (X = A*B or A*R^t) for the two-step PtAP/RARt products */
} MatMatCusparse;
1417 
/* Destructor for the MatMatCusparse context stored in C->product->data. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse*)data;
  cudaError_t    cerr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* cudaFree of a NULL pointer is a no-op, so an unused Bt is harmless */
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr);
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1430 
1431 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1432 
/* Numeric phase of the products C = A*B, A^t*B, A*B^t with A sparse
   (SEQAIJCUSPARSE) and B dense: the sparse-times-dense part is done with
   cusparse csrmm.  PtAP and RARt reuse the same kernel: X = A*B (resp.
   X = A*B^t) is computed here, then C is finished by the dense-dense kernel
   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,k,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  cusparseMatDescr_t           matA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cuer;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick op(A) and the result dimensions (m x n, inner dimension k) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = A->cmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* use the explicitly stored transpose when transpose generation is
       enabled, otherwise let cusparse transpose on the fly */
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      if (!cusp->matTranspose) {
        ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      }
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    k = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = B->cmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matA   = mat->descr;
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* two-step products write the sparse-dense result into the intermediate X */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* cusparse spmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form Bt = B^t explicitly with geam (out-of-place transpose) */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
  }

  /* perform the MatMat operation */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha,matA,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
  cuer = WaitForGPU();CHKERRCUDA(cuer);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish two-step products with the dense-dense kernel; the PetscBool flags
     presumably select transposition of the first factor — confirm against the
     SeqDenseCUDA implementation */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* convert C (and B, if we converted it above) back to the caller's CPU types */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1562 
/* Symbolic phase for products of a SEQAIJCUSPARSE matrix with a dense matrix:
   sets the sizes and (CUDA dense) type of C and allocates the auxiliary
   buffers kept in the MatMatCusparse context (Bt for products that need an
   explicit transpose of B; X as the intermediate result for PtAP/RARt).
   Installs MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA as numeric phase. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
  /* cusparse spmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr;

    cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
1636 
1637 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
1638 
1639 /* handles dense B */
/* Installs the GPU symbolic-product routine for the supported product types
   (dense B), falling back to the host SeqAIJ/SeqDense path when A is bound
   to the CPU.  Unsupported types are left untouched so the generic machinery
   can report or handle them. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  if (product->A->boundtocpu) {
    /* A lives on the CPU: delegate to the host implementation */
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
    break; /* explicit break: previously fell through into default (harmless, but warning-prone) */
  default:
    break;
  }
  PetscFunctionReturn(0);
}
1664 
/* yy = A*xx (no transpose, no add) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1673 
/* zz = A*xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1682 
/* yy = A^H * xx (conjugate transpose) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1691 
/* zz = A^H * xx + yy (conjugate transpose with add) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1700 
/* yy = A^T * xx (transpose, no conjugation) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1709 
/* Shared kernel behind MatMult/MatMultAdd/MatMultTranspose/MatMultHermitianTranspose:
   computes zz = op(A)*xx (+ yy when yy is non-NULL), where op is the identity
   (trans=PETSC_FALSE), the transpose (trans=PETSC_TRUE, herm=PETSC_FALSE), or
   the conjugate transpose (trans=PETSC_TRUE, herm=PETSC_TRUE).  Matrices stored
   with compressed (zero-free) rows are handled by gathering/scattering through
   cusparsestruct->workVector using the cprowIndices permutation. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* empty matrix: op(A)*xx is zero.
       NOTE(review): when yy is non-NULL and yy != zz, zz is left untouched
       here rather than set to yy — presumably callers pass zz == yy in that
       situation; verify against the callers. */
    if (!yy) {
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  } else {
    /* use the cached explicit transpose when available; otherwise (or for the
       Hermitian case) ask cusparse to transpose on the fly */
    if (herm || !cusparsestruct->transgen) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      if (!matstruct) {
        ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
        matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      }
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv output if compressed */
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
    } else {
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; /* Use a work vector for SpMv input if compressed */
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;

      /* gather the needed entries of xx into the compressed work vector */
      if (compressed) {
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
        cerr = WaitForGPU();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    }

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv does y = alpha*Ax + beta*y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      if (cusparsestruct->nrows) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    cerr = WaitForGPU();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForGPU();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1836 
/* zz = A^T * xx + yy (transpose with add) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1845 
/* Completes assembly with the host AIJ routine, then mirrors the data onto
   the GPU — unless this is only a flush, the matrix is bound to the CPU, or
   it is a factored matrix (whose device data is handled elsewhere). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1858 
1859 /* --------------------------------------------------------------------------------*/
1860 /*@
1861    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
1864    assembly performance the user should preallocate the matrix storage by setting
1865    the parameter nz (or the array nnz).  By setting these parameters accurately,
1866    performance during matrix assembly can be increased by more than a factor of 50.
1867 
1868    Collective
1869 
1870    Input Parameters:
1871 +  comm - MPI communicator, set to PETSC_COMM_SELF
1872 .  m - number of rows
1873 .  n - number of columns
1874 .  nz - number of nonzeros per row (same for all rows)
1875 -  nnz - array containing the number of nonzeros in the various rows
1876          (possibly different for each row) or NULL
1877 
1878    Output Parameter:
1879 .  A - the matrix
1880 
1881    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1883    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1884 
1885    Notes:
1886    If nnz is given then nz is ignored
1887 
1888    The AIJ format (also called the Yale sparse matrix format or
1889    compressed row storage), is fully compatible with standard Fortran 77
1890    storage.  That is, the stored row and column indices can begin at
1891    either one (as in Fortran) or zero.  See the users' manual for details.
1892 
1893    Specify the preallocated storage with either nz or nnz (not both).
1894    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1895    allocation.  For large problems you MUST preallocate memory or you
1896    will get TERRIBLE performance, see the users' manual chapter on matrices.
1897 
1898    By default, this format uses inodes (identical nodes) when possible, to
1899    improve numerical efficiency of matrix-vector products and solves. We
1900    search for consecutive rows with the same nonzero structure, thereby
1901    reusing matrix information to achieve increased efficiency.
1902 
1903    Level: intermediate
1904 
1905 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1906 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocation is done on the host; data is mirrored to the GPU at assembly time */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1918 
/* Destroy an AIJCUSPARSE matrix: release the GPU-side data hanging off A->spptr
   (plain mult structures for an unfactored matrix, triangular-factor structures
   for a factored one), clear the composed query functions, then delegate to the
   base SeqAIJ destructor for the host-side data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* spptr points at different structs depending on whether A holds factors */
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  /* Remove composed functions registered at convert/creation time so no stale
     pointers survive the type teardown */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  /* Base class cleans up the CPU (host) CSR data */
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1936 
1937 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
1938 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by first cloning the host SeqAIJ matrix, then converting the clone
   in place to AIJCUSPARSE so it picks up the GPU ops table and spptr state. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1948 
/* Switch the matrix operation table between the host (SeqAIJ) and device
   (SeqAIJCUSPARSE) implementations. flg == PETSC_TRUE binds to the CPU.
   Factored matrices are left untouched: their solve kernels are managed
   separately by the factorization code. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
           can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
  /* TODO: add support for this? */
  if (flg) {
    /* Host ops; the Hermitian-transpose kernels have no SeqAIJ counterpart here */
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    /* GPU-specific MatProduct hooks are meaningless on the CPU; clear them */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  } else {
    /* Device ops */
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}
1984 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE.

   reuse semantics:
     MAT_INITIAL_MATRIX - duplicate A into *newmat, then convert the copy;
     MAT_REUSE_MATRIX   - copy A's values into the existing *newmat;
     MAT_INPLACE_MATRIX - convert A itself (*newmat aliases A).
   The conversion sets the CUDA default vector type, allocates the GPU-side
   spptr (mult structures, or triangular-factor structures for a factored
   matrix) with its own cuSPARSE handle, installs the CUSPARSE ops table via
   MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE), and retypes the object. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* Vectors created to match this matrix should live on the GPU */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  /* For MAT_REUSE_MATRIX the target already carries GPU state; otherwise
     allocate spptr once (it may survive repeated in-place conversions) */
  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    /* Nothing has been copied to the device yet */
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* Install the GPU mult/product ops */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2030 
/* Type constructor registered for MATSEQAIJCUSPARSE: build a SeqAIJ matrix,
   then convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2040 
2041 /*MC
2042    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2043 
2044    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
2045    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2046    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2047 
2048    Options Database Keys:
2049 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2050 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2051 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2052 
2053   Level: beginner
2054 
2055 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2056 M*/
2057 
2058 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
2059 
2060 
/* Register the CUSPARSE solver package for every factorization kind it
   supports (LU, Cholesky, ILU, ICC) on MATSEQAIJCUSPARSE matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType kinds[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              k;

  PetscFunctionBegin;
  /* All four factor types share the same factory routine */
  for (k=0; k<sizeof(kinds)/sizeof(kinds[0]); k++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,kinds[k],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2072 
/* Free all GPU-side state of an unfactored AIJCUSPARSE matrix: the mult
   structure, its transpose, scratch vectors, and the cuSPARSE handle, then
   the container itself. Safe to call with *cusparsestruct == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    /* Test-then-destroy rather than the original assignment-inside-if, which
       draws -Wparentheses and is easy to misread as a comparison */
    if ((*cusparsestruct)->handle) {
      stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2092 
/* Delete a device CsrMatrix (values, column indices, row offsets) and reset
   the caller's pointer. A NULL *mat is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
2105 
/* Free one triangular-factor structure: its cuSPARSE matrix descriptor, the
   solve-analysis info, and the CSR storage, then the struct itself and the
   caller's pointer. (C++ overload of the MultStruct variant below, selected
   by parameter type.) Safe to call with *trifactor == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
2121 
/* Free one mult structure: the stored matrix (HYB or CSR depending on the
   storage format), the cuSPARSE descriptor, the compressed-row index vector,
   and the device-resident scalar constants. Safe with *matstruct == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        /* ELL is stored via cuSPARSE's hybrid container as well */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* Check the return code; it was silently dropped before, unlike every
           other PETSc call in this file */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta_zero/beta_one are scalars living in device memory */
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
2149 
/* Release the contents of a tri-factors container (all four triangular
   factors, both permutation index vectors, and the work vector) while keeping
   the container itself allocated so it can be refilled by a refactorization. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (fs) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);CHKERRQ(ierr);
    /* delete and immediately clear each pointer so the struct is reusable */
    delete fs->rpermIndices; fs->rpermIndices = NULL;
    delete fs->cpermIndices; fs->cpermIndices = NULL;
    delete fs->workVector;   fs->workVector   = NULL;
  }
  PetscFunctionReturn(0);
}
2169 
/* Fully destroy a tri-factors container: reset its contents, destroy its
   cuSPARSE handle, and free the container. Safe with *trifactors == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* Test-then-destroy rather than the original assignment-inside-if, which
       draws -Wparentheses and is easy to misread as a comparison */
    if ((*trifactors)->handle) {
      stat = cusparseDestroy((*trifactors)->handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2186