xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 85ba735771a8b5cec8c45d73c93d760c26689b57)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* Option strings for the supported cuSPARSE storage formats, terminated per the PetscEnum
   convention: values ("CSR","ELL","HYB"), then the enum type name, the option prefix, and 0 */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 
19 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
20 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
21 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
22 
23 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
24 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
26 
27 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
31 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
32 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
35 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
36 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
37 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
38 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
39 
40 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
41 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
42 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
43 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
44 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
45 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
46 
/*
   MatCUSPARSESetStream - Records the given CUDA stream in the matrix's GPU
   structure and attaches it to the matrix's cuSPARSE handle, so subsequent
   cuSPARSE calls made through that handle are queued on this stream.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
57 
/*
   MatCUSPARSESetHandle - Installs a caller-provided cuSPARSE handle on the
   matrix. If the matrix already owns a different handle, that one is destroyed
   first. The pointer mode is (re)set to device pointers in all cases.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (handle != cusparsestruct->handle) {
    /* release the old handle (if any) before adopting the new one */
    if (cusparsestruct->handle) {stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);}
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
73 
/*
   MatCUSPARSEClearHandle - Drops the matrix's reference to its cuSPARSE handle
   without destroying it (the handle is owned by the caller who installed it
   via MatCUSPARSESetHandle).

   Robustness fix: guard against a NULL A->spptr (e.g. a matrix whose GPU
   structure was never allocated) instead of dereferencing it unconditionally.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
82 
/* Reports the solver package name (MATSOLVERCUSPARSE, i.e. "cusparse") for factor
   matrices created by this package; composed on factor matrices as "MatFactorGetSolverType_C". */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
89 
90 /*MC
91   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU, of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
94   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
95   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
96   algorithms are not recommended. This class does NOT support direct solver operations.
97 
98   Level: beginner
99 
100 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
101 M*/
102 
/*
   MatGetFactor_seqaijcusparse_cusparse - Creates the (empty) factor matrix B for A
   and installs the symbolic-factorization callbacks appropriate to the requested
   factor type; numeric callbacks are installed later by the symbolic routines.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* MAT_SKIP_ALLOCATION: the symbolic factorization allocates the actual storage */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
127 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - Type-specific implementation behind
   MatCUSPARSESetFormat(). For sequential matrices MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL both select the single stored format.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: both ops set the one format field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
145 
146 /*@
147    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
148    operation. Only the MatMult operation can use different GPU storage formats
149    for MPIAIJCUSPARSE matrices.
150    Not Collective
151 
152    Input Parameters:
153 +  A - Matrix of type SEQAIJCUSPARSE
154 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
155 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
156 
157    Output Parameter:
158 
159    Level: intermediate
160 
161 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
162 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: dispatches to the type-specific implementation if one is
     composed on A; silently a no-op for matrix types without "MatCUSPARSESetFormat_C" */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
172 
173 /*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
175 
176    Collective on mat
177 
178    Input Parameters:
179 +  A - Matrix of type SEQAIJCUSPARSE
180 -  transgen - the boolean flag
181 
182    Level: intermediate
183 
184 .seealso: MATSEQAIJCUSPARSE
185 @*/
/*
   MatSeqAIJCUSPARSESetGenerateTranspose - Stores the transgen flag on the GPU
   structure. Silently does nothing for matrix types other than SEQAIJCUSPARSE;
   errors for factored matrices of that type.
*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      isseqaijcusparse;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&isseqaijcusparse);CHKERRQ(ierr);
  if (isseqaijcusparse) {
    Mat_SeqAIJCUSPARSE *cusp;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp           = (Mat_SeqAIJCUSPARSE*)A->spptr;
    cusp->transgen = transgen;
  }
  PetscFunctionReturn(0);
}
201 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - Processes the -mat_cusparse_* options.
   The transgen and SpMV-only format options apply only to unfactored matrices;
   -mat_cusparse_storage_format (SpMV and TriSolve) is honored for all.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;
  Mat_SeqAIJCUSPARSE       *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",cusp->transgen,&cusp->transgen,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,fmt);CHKERRQ(ierr);}
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
  if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);}
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
227 
/* Symbolic ILU: delegate to the CPU SeqAIJ implementation, then install the
   CUSPARSE numeric-factorization kernel so the numeric phase runs on the GPU. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
237 
/* Symbolic LU: delegate to the CPU SeqAIJ implementation, then install the
   CUSPARSE numeric-factorization kernel so the numeric phase runs on the GPU. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
247 
/* Symbolic ICC: delegate to the CPU SeqAIJ implementation, then install the
   CUSPARSE Cholesky numeric kernel so the numeric phase runs on the GPU. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
257 
/* Symbolic Cholesky: delegate to the CPU SeqAIJ implementation, then install the
   CUSPARSE Cholesky numeric kernel so the numeric phase runs on the GPU. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
267 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Extracts the unit-diagonal lower
   triangular factor L from the combined LU factor stored in A's CPU (SeqAIJ)
   data, uploads it to the GPU as a 32-bit-index CSR matrix, and runs the
   cuSPARSE triangular-solve analysis on it. Only rebuilds when the host copy
   is newer than the device copy (offloadmask UNALLOCATED or CPU).

   NOTE(review): assumes a->i/a->j/a->a hold the factored matrix with the L
   entries (strictly lower part) stored first in each row — TODO confirm
   against MatLUFactorNumeric_SeqAIJ's storage layout.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* row 0 contributes only its unit diagonal; rows 1..n-1 contribute their strictly-lower
         entries (ai[n]-ai[1] in total) plus one unit diagonal each */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host memory (cudaMallocHost) so the thrust assign() below copies fast */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;          /* next free slot in AjLo/AALo (slot 0 holds row 0's unit diagonal) */
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: upload the host CSR arrays into device (thrust) vectors */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* pinned staging buffers are no longer needed once the device copies exist */
      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
372 
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Extracts the upper triangular
   factor U from the combined LU factor stored in A's CPU (SeqAIJ) data,
   uploads it to the GPU as a 32-bit-index CSR matrix, and runs the cuSPARSE
   triangular-solve analysis on it. The stored reciprocal pivots (a->a at
   a->diag) are inverted here so U carries the actual diagonal values.
   Only rebuilds when the host copy is newer than the device copy.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag runs backwards (adiag[0] > adiag[n]) for the U part of the SeqAIJ factor storage */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* v[nz] holds the stored reciprocal pivot; invert it to recover the U diagonal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i */
        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: upload the host CSR arrays into device (thrust) vectors */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* pinned staging buffers are no longer needed once the device copies exist */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
473 
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - Rebuilds both GPU triangular
   factors from the CPU ILU data, allocates the device work vector, and uploads
   the row/column permutations (only when they are not the identity, since the
   identity case can skip the device-side permutation entirely).
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* drop any stale device-side factors before rebuilding L and U */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used between the L and U solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log exactly the amount of permutation data actually uploaded above */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
521 
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - Builds both device triangular factors
   for an ICC/Cholesky factorization from the CPU factor data. Only the upper
   triangle is stored (CSR pattern AiUp/AjUp): the "upper" factor gets values
   AAUp and is solved non-transposed, while the "lower" factor reuses the same
   upper-triangular pattern with values AALo and is solved with
   CUSPARSE_OPERATION_TRANSPOSE — i.e. the lower solve is the transpose of a
   stored upper-triangular matrix.

   NOTE(review): A->data is cast both to Mat_SeqAIJ (for nz) and Mat_SeqSBAIJ
   (for i/j/a); this relies on the factored matrix using SBAIJ-compatible
   upper-triangular storage — TODO confirm against MatICCFactorSymbolic_SeqAIJ.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host staging buffers;
         AAUp/AALo share the AiUp/AjUp sparsity pattern) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the stored reciprocal pivot; the diagonal of both factors becomes 1/v[nz] */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* off-diagonals: U gets -v, while the transposed-lower values are additionally
             scaled by the reciprocal pivot */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper fill + transpose solve below = an
         implicit lower-triangular solve on the same stored pattern */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same row/column structure as the upper factor, AALo values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      /* pinned staging buffers are no longer needed once the device copies exist */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
669 
/*
   Set up the ICC/Cholesky triangular-solve data on the GPU: discard any stale
   factors, rebuild them from the host matrix, allocate the device work vector,
   and (for a non-identity ordering) upload the permutation and its inverse.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm     = a->row;
  const PetscInt               *permIdx;
  PetscBool                    identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* throw away any previously built factors, then regenerate them */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  factors->workVector = new THRUSTARRAY(n);
  /* nonzero count over both triangular factors; used by the solve kernels for GPU flop logging */
  factors->nnz = (a->nz-n)*2 + n;

  /* stage the ordering on the GPU when it is not the identity */
  ierr = ISGetIndices(perm,&permIdx);CHKERRQ(ierr);
  ierr = ISIdentity(perm,&identity);CHKERRQ(ierr);
  if (!identity) {
    IS             invperm;
    const PetscInt *invpermIdx;

    ierr = ISInvertPermutation(perm,PETSC_DECIDE,&invperm);CHKERRQ(ierr);
    ierr = ISGetIndices(invperm,&invpermIdx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(permIdx, permIdx+n);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(invpermIdx, invpermIdx+n);
    ierr = ISRestoreIndices(invperm,&invpermIdx);CHKERRQ(ierr);
    ierr = ISDestroy(&invperm);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(perm,&permIdx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
706 
/*
   Numeric LU factorization for SEQAIJCUSPARSE. The factorization itself is done
   on the CPU by MatLUFactorNumeric_SeqAIJ; the triangular factors are then
   analyzed and copied to the GPU so that subsequent MatSolve calls run there.

   Input Parameters:
+  B    - the factor matrix
.  A    - the matrix being factored
-  info - options for the factorization
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used: the natural-ordering
     kernels skip the permutation steps and are valid only when both orderings
     are identities */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no GPU matsolve is provided in either case; clear any inherited pointers
     (hoisted out of the branches above, where it was duplicated) */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
736 
/*
   Numeric Cholesky factorization for SEQAIJCUSPARSE. The factorization itself
   is done on the CPU by MatCholeskyFactorNumeric_SeqAIJ; the triangular factors
   are then analyzed and copied to the GPU so that subsequent MatSolve calls run
   there.

   Input Parameters:
+  B    - the factor matrix
.  A    - the matrix being factored
-  info - options for the factorization
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used: the natural-ordering
     kernels skip the permutation steps and are valid only for an identity ordering */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no GPU matsolve is provided in either case; clear any inherited pointers
     (hoisted out of the branches above, where it was duplicated) */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
765 
/*
   Build the analyzed transpose of one triangular factor, i.e. the CSC of the
   factor, which is the CSR of its transpose. Storing the transpose explicitly
   lets the transpose solves run with CUSPARSE_OPERATION_NON_TRANSPOSE.

   Input Parameters:
+  cusparseTriFactors - the per-matrix cuSPARSE factor container (supplies the handle)
-  triFactor          - the factor to transpose

   Output Parameter:
.  triFactorT - the newly allocated, analyzed transposed factor
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactorAnalyzeTranspose(Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors,Mat_SeqAIJCUSPARSETriFactorStruct *triFactor,Mat_SeqAIJCUSPARSETriFactorStruct **triFactorT)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *factorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the triangular factor */
  factorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* mirror the descriptors of the input factor, flipping the fill mode since
     the stored matrix is the transpose */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase  = cusparseGetMatIndexBase(triFactor->descr);
  fillMode   = cusparseGetMatFillMode(triFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(triFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&factorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(factorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(factorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(factorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(factorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&factorT->solveInfo);CHKERRCUSPARSE(stat);

  /* the transpose is stored explicitly, so the solve op is NON_TRANSPOSE */
  factorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the triangular factor */
  factorT->csrMat                 = new CsrMatrix;
  factorT->csrMat->num_rows       = triFactor->csrMat->num_rows;
  factorT->csrMat->num_cols       = triFactor->csrMat->num_cols;
  factorT->csrMat->num_entries    = triFactor->csrMat->num_entries;
  factorT->csrMat->row_offsets    = new THRUSTINTARRAY32(triFactor->csrMat->num_rows+1);
  factorT->csrMat->column_indices = new THRUSTINTARRAY32(triFactor->csrMat->num_entries);
  factorT->csrMat->values         = new THRUSTARRAY(triFactor->csrMat->num_entries);

  /* compute the transpose of the triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                          triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                          triFactor->csrMat->values->data().get(),
                          triFactor->csrMat->row_offsets->data().get(),
                          triFactor->csrMat->column_indices->data().get(),
                          factorT->csrMat->values->data().get(),
                          factorT->csrMat->column_indices->data().get(),
                          factorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, factorT->solveOp,
                           factorT->csrMat->num_rows, factorT->csrMat->num_entries,
                           factorT->descr, factorT->csrMat->values->data().get(),
                           factorT->csrMat->row_offsets->data().get(), factorT->csrMat->column_indices->data().get(),
                           factorT->solveInfo);CHKERRCUSPARSE(stat);

  *triFactorT = factorT;
  PetscFunctionReturn(0);
}

/*
   Create and analyze the transposes of both triangular factors, caching them in
   A->spptr for use by MatSolveTranspose.

   Note: the previous version initialized loTriFactorT/upTriFactorT from the
   cached pointers and immediately overwrote them with `new` (dead stores), and
   duplicated ~100 lines of identical code for the two factors; both halves now
   share MatSeqAIJCUSPARSETriFactorAnalyzeTranspose().
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT,*upTriFactorT;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* the transpose of the lower triangular factor */
  ierr = MatSeqAIJCUSPARSETriFactorAnalyzeTranspose(cusparseTriFactors,loTriFactor,&loTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* the transpose of the upper triangular factor */
  ierr = MatSeqAIJCUSPARSETriFactorAnalyzeTranspose(cusparseTriFactors,upTriFactor,&upTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
896 
/*
   Generate and cache (in cusparsestruct->matTranspose) the cuSPARSE data
   structure for the transpose of A, used by MatMultTranspose on the GPU.
   No-op unless transpose generation was requested (transgen) and the transpose
   has not already been built.

   Fix: in the ELL/HYB branch the transposed CSR intermediate tempT was given
   the dimensions of A instead of A^T, and its row_offsets array was sized
   A->rmap->n+1 while csr2csc writes A->cmap->n+1 offsets -- a buffer overrun
   for rectangular matrices (the CSR branch already sizes by A->cmap->n).
   csr2hyb was likewise called with the untransposed dimensions.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta (device-resident scalars for the mult kernels) */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* the CSC arrays of A are exactly the CSR arrays of A^T */
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    CsrMatrix *temp = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;

    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose). The transpose has
       the dimensions of A swapped, and csr2csc writes temp->num_cols+1 offsets,
       so the offset array must be sized by A->cmap->n (as in the CSR branch). */
    tempT->num_rows = A->cmap->n;
    tempT->num_cols = A->rmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->cmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB; the HYB matrix holds A^T, hence the swapped dimensions */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  err  = WaitForGPU();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1024 
/* Solve A^T x = b using explicitly stored, pre-analyzed transposed triangular factors
   (built lazily on first use). For A = LU we have A^T = U^T L^T, so the transposed
   upper factor is applied first, then the transposed lower factor, with the
   row/column permutations applied before and after the solves.
   NOTE(review): why do we need to analyze the transposed matrix again? Can't we just
   use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (cached in A->spptr) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: a gather, x[i] = b[rperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (the transposed factors make this a NON_TRANSPOSE solve); result in tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L; result back in xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place.
     This gathers tempGPU[i] = x[cperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: ~2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1095 
/*
   Solve A^T x = b for a factored matrix in natural ordering: no permutations
   are needed. The explicitly stored, pre-analyzed transposed factors are built
   lazily on first use; since A = LU gives A^T = U^T L^T, the transposed upper
   factor is applied first.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  cstat;
  cudaError_t                       cuerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* the transposed factors are created and analyzed on first use, then cached */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* map the vectors onto the device */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* solve with the transposed upper factor first: work holds the intermediate */
  cstat = cusparse_solve(fs->handle, upT->solveOp,
                         upT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upT->descr,
                         upT->csrMat->values->data().get(),
                         upT->csrMat->row_offsets->data().get(),
                         upT->csrMat->column_indices->data().get(),
                         upT->solveInfo,
                         b, work->data().get());CHKERRCUSPARSE(cstat);

  /* then with the transposed lower factor, writing the solution into x */
  cstat = cusparse_solve(fs->handle, loT->solveOp,
                         loT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loT->descr,
                         loT->csrMat->values->data().get(),
                         loT->csrMat->row_offsets->data().get(),
                         loT->csrMat->column_indices->data().get(),
                         loT->solveInfo,
                         work->data().get(), x);CHKERRCUSPARSE(cstat);

  /* release the device arrays and account for the work */
  ierr  = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  cuerr = WaitForGPU();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1147 
/* Solve A x = b on the GPU for a factored matrix with non-trivial row/column
   orderings: gather b by the row permutation, run the lower then upper
   triangular solves, then gather the result by the column permutation. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: a gather, tempGPU[i] = b[rperm[i]].
     NOTE(review): unlike MatSolveTranspose above, the end iterator is built from
     bGPU rather than bGPU+n; this is equivalent, since the distance between
     permutation iterators is determined by the index iterators alone. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L; the intermediate result lands in xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Then, solve U; the (unpermuted) solution lands back in tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: a gather, x[i] = tempGPU[cperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: ~2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1206 
/*
   Solve A x = b on the GPU for factors in natural ordering: no row/column
   permutations are required, so this runs the lower-factor solve followed by
   the upper-factor solve directly on the vector arrays.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  cstat;
  cudaError_t                       cuerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* map the vectors onto the device */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* solve with the lower factor first: work holds the intermediate */
  cstat = cusparse_solve(fs->handle, lo->solveOp,
                         lo->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lo->descr,
                         lo->csrMat->values->data().get(),
                         lo->csrMat->row_offsets->data().get(),
                         lo->csrMat->column_indices->data().get(),
                         lo->solveInfo,
                         b, work->data().get());CHKERRCUSPARSE(cstat);

  /* then with the upper factor, writing the solution into x */
  cstat = cusparse_solve(fs->handle, up->solveOp,
                         up->csrMat->num_rows, &PETSC_CUSPARSE_ONE, up->descr,
                         up->csrMat->values->data().get(),
                         up->csrMat->row_offsets->data().get(),
                         up->csrMat->column_indices->data().get(),
                         up->solveInfo,
                         work->data().get(), x);CHKERRCUSPARSE(cstat);

  /* release the device arrays and account for the work */
  ierr  = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  cuerr = WaitForGPU();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1250 
/* Mirror the host AIJ arrays (a->i, a->j, a->a) on the GPU in the storage format
   selected by cusparsestruct->format (CSR, ELL or HYB).  Two paths:
   - fast path: the nonzero pattern is unchanged since the last copy and the
     format is CSR -> copy numerical values only, and refresh the cached
     transpose when one exists;
   - slow path: destroy and rebuild the whole GPU representation.
   No-op when the matrix is bound to the CPU or the GPU copy is up to date. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; /* replaced by a freshly allocated struct on the slow path */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only: the sparsity pattern on the GPU is still valid */
      CsrMatrix *mat,*matT; /* matT is only assigned/used when a transpose is cached below */
      mat  = (CsrMatrix*)cusparsestruct->mat->mat;

      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      mat->values->assign(a->a, a->a+a->nz);
      err  = WaitForGPU();CHKERRCUDA(err);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT when it was built before: redo the numeric csr2csc with the new values */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                 A->cmap->n, mat->num_entries,
                                 mat->values->data().get(),
                                 cusparsestruct->rowoffsets_gpu->data().get(),
                                 mat->column_indices->data().get(),
                                 matT->values->data().get(),
                                 matT->column_indices->data().get(),
                                 matT->row_offsets->data().get(),
                                 CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
        err  = WaitForGPU();CHKERRCUDA(err);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      /* Slow path: tear down any previous GPU representation and rebuild it */
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu; /* NOTE(review): pointer not reset to NULL here; presumably rebuilt before any reuse (e.g. by transpose generation) -- confirm */
      try {
        /* with compressed rows only the nonzero rows are stored on the GPU;
           ridx maps compressed row numbers back to full row numbers */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        cusparsestruct->nrows = m;

        /* create cusparse matrix */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalars 1 and 0 so spmv/spmm can run in device pointer mode */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* a temporary CSR matrix on the GPU is needed as input to csr2hyb */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          /* ELL uses a fixed-width (max row length) partition; HYB lets cusparse choose */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging arrays are no longer needed once the HYB matrix is built */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForGPU();CHKERRCUDA(err);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* remember the pattern this copy corresponds to, enabling the fast path above */
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1409 
/* Thrust functor for zipped (source, destination) pairs: accumulates the first
   tuple element into the second (dst += src).  Used to scatter-add the
   compressed-row work vector into the full-length result vector. */
struct VecCUDAPlusEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T zipped)
  {
    thrust::get<1>(zipped) += thrust::get<0>(zipped);
  }
};
1419 
/* Thrust functor for zipped (destination, source) pairs: assigns the second
   tuple element to the first (dst = src).  "Reverse" refers to the tuple order
   being opposite to VecCUDAPlusEquals: here element 0 is the one written. */
struct VecCUDAEqualsReverse
{
  template <typename T>
  __host__ __device__
  void operator()(T zipped)
  {
    thrust::get<0>(zipped) = thrust::get<1>(zipped);
  }
};
1429 
/* Per-product data stashed in C->product->data for AIJCUSPARSE * DENSECUDA products */
typedef struct {
  PetscBool   cisdense; /* C was MATSEQDENSE (CPU) on entry; convert the result back after the GPU computation */
  PetscScalar *Bt;      /* device buffer for an explicit transpose of B (ABt/RARt only; csrmm cannot transpose B) */
  Mat         X;        /* intermediate dense matrix holding the sparse*dense product for RARt/PtAP */
} MatMatCusparse;
1435 
/* Destructor for the MatMatCusparse data attached to C->product->data:
   releases the device transpose buffer, the intermediate dense matrix,
   and finally the struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse*)data;
  cudaError_t    cerr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a no-op, so safe when Bt was never allocated */
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1448 
1449 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1450 
/* Numeric phase of C = op(A)*op(B) with A sparse (SEQAIJCUSPARSE) and B dense.
   Handles MATPRODUCT_AB, AtB, ABt, PtAP and RARt.  The sparse*dense multiply is
   done with cusparse csrmm, which cannot transpose B: ABt/RARt therefore first
   build an explicit transpose of B in mmdata->Bt using cublas geam.  For
   PtAP/RARt the csrmm result goes into the intermediate dense matrix mmdata->X,
   which is then combined with B by a dense-dense multiply. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,k,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  cusparseMatDescr_t           matA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cuer;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick the cusparse mult structure and the (m,n,k) dimensions of the csrmm call */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* PtAP: first X = A*B here, then C = B^T*X below */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = A->cmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the explicitly stored transpose instead of transposing inside csrmm */
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    k = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* RARt: first X = A*B^T here, then C = B*X below */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = B->cmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matA   = mat->descr;
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* RARt/PtAP write into the intermediate X; the other products write into C directly */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* cusparse spmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* Bt = B^T via geam (out-of-place transpose); its leading dimension is B->cmap->n */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
  }

  /* perform the MatMat operation */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha,matA,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
  cuer = WaitForGPU();CHKERRCUDA(cuer);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    /* C = B * X = R * (A*R^T) */
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    /* C = B^T * X = P^T * (A*P) */
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* restore the types the caller originally passed in */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1578 
/* Symbolic phase of C = op(A)*op(B) with A sparse (SEQAIJCUSPARSE) and B dense:
   sets the sizes and (CUDA dense) type of C, allocates the per-product data
   (transpose buffer Bt for ABt/RARt, intermediate dense X for RARt/PtAP) and
   installs the numeric phase. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions for each product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
  /* cusparse spmm does not support transpose on B, so allocate space for an explicit B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr;

    cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
1652 
1653 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
1654 
1655 /* handles dense B */
/* handles dense B: installs the CUDA symbolic phase for the supported product
   types, or defers to the CPU SeqAIJ*SeqDense path when A is bound to the CPU */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  if (product->A->boundtocpu) { /* A lives on the CPU: use the host implementation */
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* all five product types are routed to the same GPU symbolic routine;
     any other type leaves C->ops->productsymbolic untouched */
  if (product->type == MATPRODUCT_AB   || product->type == MATPRODUCT_AtB  ||
      product->type == MATPRODUCT_ABt  || product->type == MATPRODUCT_PtAP ||
      product->type == MATPRODUCT_RARt) {
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
  }
  PetscFunctionReturn(0);
}
1680 
/* yy = A*xx; thin wrapper over the shared SpMV kernel (no add, no transpose) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1689 
/* zz = A*xx + yy; thin wrapper over the shared SpMV kernel (add, no transpose) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1698 
/* yy = A^H*xx; thin wrapper over the shared SpMV kernel (conjugate transpose) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1707 
/* zz = A^H*xx + yy; thin wrapper over the shared SpMV kernel (add, conjugate transpose) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1716 
/* yy = A^T*xx; thin wrapper over the shared SpMV kernel (transpose, no conjugation) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1725 
/* Shared SpMV kernel behind all MatMult* variants of this class:
     zz = op(A)*xx + yy      (yy may be NULL, meaning zz = op(A)*xx)
   where op is the identity (!trans), the transpose (trans && !herm) or the
   conjugate transpose (trans && herm).  When the matrix stores compressed rows
   (zero rows dropped), the SpMV reads/writes a work vector of length nrows and
   the result is scattered/gathered to the full-length vectors with thrust,
   using cprowIndices as the row map. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* A has no nonzero rows, so op(A)*xx == 0 and the result is yy (or zero when
       there is no yy).  The previous code returned without updating zz in the
       MatMultAdd case, leaving stale data in zz. */
    if (!yy) {
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    } else if (zz != yy) {
      ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    }
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  } else {
    /* Hermitian, or transpose without a generated explicit transpose: let
       cusparse apply the (conjugate) transpose; otherwise multiply with the
       explicitly stored transpose, building it on first use */
    if (herm || !cusparsestruct->transgen) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      if (!matstruct) {
        ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
        matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      }
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv output if compressed */
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
    } else {
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; /* Use a work vector for SpMv input if compressed */
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;

      /* gather the needed entries of x into the work vector before the SpMV */
      if (compressed) {
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
    }

    /* csr_spmv does y = alpha*Ax + beta*y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      if (cusparsestruct->nrows) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    cerr = WaitForGPU();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForGPU();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1849 
/* zz = A^T*xx + yy; thin wrapper over the shared SpMV kernel (add, transpose) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1858 
/* Complete the host-side SeqAIJ assembly, then push the data to the GPU for a
   final (non-flush) assembly of an unfactored matrix not bound to the CPU. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1871 
1872 /* --------------------------------------------------------------------------------*/
1873 /*@
1874    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
1877    assembly performance the user should preallocate the matrix storage by setting
1878    the parameter nz (or the array nnz).  By setting these parameters accurately,
1879    performance during matrix assembly can be increased by more than a factor of 50.
1880 
1881    Collective
1882 
1883    Input Parameters:
1884 +  comm - MPI communicator, set to PETSC_COMM_SELF
1885 .  m - number of rows
1886 .  n - number of columns
1887 .  nz - number of nonzeros per row (same for all rows)
1888 -  nnz - array containing the number of nonzeros in the various rows
1889          (possibly different for each row) or NULL
1890 
1891    Output Parameter:
1892 .  A - the matrix
1893 
1894    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1896    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1897 
1898    Notes:
1899    If nnz is given then nz is ignored
1900 
1901    The AIJ format (also called the Yale sparse matrix format or
1902    compressed row storage), is fully compatible with standard Fortran 77
1903    storage.  That is, the stored row and column indices can begin at
1904    either one (as in Fortran) or zero.  See the users' manual for details.
1905 
1906    Specify the preallocated storage with either nz or nnz (not both).
1907    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1908    allocation.  For large problems you MUST preallocate memory or you
1909    will get TERRIBLE performance, see the users' manual chapter on matrices.
1910 
1911    By default, this format uses inodes (identical nodes) when possible, to
1912    improve numerical efficiency of matrix-vector products and solves. We
1913    search for consecutive rows with the same nonzero structure, thereby
1914    reusing matrix information to achieve increased efficiency.
1915 
1916    Level: intermediate
1917 
1918 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1919 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Create an empty matrix, fix its sizes (sequential, so local == global),
     set the CUSPARSE type, then preallocate directly through the SeqAIJ
     preallocation routine; the cast only drops const to match its signature. */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1931 
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Release the GPU-side data hanging off A->spptr (mult structures for a
     regular matrix, triangular factors for a factored one), detach the
     composed functions, then fall through to the host-side SeqAIJ destroy. */
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  /* composing NULL removes the function from the object */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1949 
1950 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
1951 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
1952 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
1953 {
1954   PetscErrorCode ierr;
1955 
1956   PetscFunctionBegin;
1957   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
1958   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
1959   PetscFunctionReturn(0);
1960 }
1961 
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Switch the mat-vec operations between the host SeqAIJ implementations
     (flg == PETSC_TRUE) and the CUSPARSE implementations; factored matrices
     are left untouched. */
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
           can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
  /* TODO: add support for this? */
  if (flg) {
    /* bind to CPU: plain SeqAIJ kernels; no Hermitian-transpose support there */
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    /* remove the GPU MatProduct hooks while bound to the CPU */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  } else {
    /* bind to GPU: CUSPARSE kernels and MatProduct hooks */
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}
1997 
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  /* Convert a SeqAIJ matrix to SeqAIJCUSPARSE.  MAT_INITIAL_MATRIX creates a
     new matrix, MAT_REUSE_MATRIX copies values into the existing *newmat, and
     MAT_INPLACE_MATRIX converts *newmat (expected to be A) in place. */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* vectors derived from this matrix should default to the CUDA vector type */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: GPU mult structures, CSR storage format by default */
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    } else {
      /* factored matrix: container for the triangular factors instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    /* nothing has been copied to the GPU yet */
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  /* install the CUSPARSE-specific implementations of the matrix operations */
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* sets the GPU mult/product function pointers (PETSC_FALSE = not bound to CPU) */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2043 
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Type constructor: build the host SeqAIJ representation first, then
     convert the matrix in place to the CUSPARSE type. */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2053 
2054 /*MC
2055    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2056 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
2058    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2059    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2060 
2061    Options Database Keys:
2062 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2063 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2064 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2065 
2066   Level: beginner
2067 
2068 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2069 M*/
2070 
2071 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
2072 
2073 
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscInt            i;
  PetscErrorCode      ierr;

  PetscFunctionBegin;
  /* Register MATSOLVERCUSPARSE as a factorization provider for sequential
     AIJCUSPARSE matrices, once for each supported factor type. */
  for (i = 0; i < (PetscInt)(sizeof(ftypes)/sizeof(ftypes[0])); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2085 
/* Destroy the GPU-side support structure of a (non-factored) AIJCUSPARSE
   matrix: both mult structures, the scratch vectors, the cusparse handle,
   and finally the container itself (PetscFree() also NULLs *cusparsestruct). */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    /* assign before testing; the original "if (handle = ...)" assignment
       inside the condition draws -Wparentheses warnings and invites ==/= typos */
    handle = (*cusparsestruct)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2105 
/* Free a CsrMatrix (its three device arrays plus the struct) and NULL the
   caller's pointer; a NULL input is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
2118 
/* Overload for a triangular-factor structure: free the cusparse matrix
   descriptor, the solve-analysis info, the CSR storage, then the struct
   itself, and NULL the caller's pointer. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
2134 
/* Overload for a mult structure: free the stored matrix (HYB descriptor for
   ELL/HYB formats, CsrMatrix otherwise), the matrix descriptor, the compressed
   row indices, and the device-resident scalar constants, then the struct
   itself, and NULL the caller's pointer. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* check the return code; the original dropped it, unlike every other
           CsrMatrix_Destroy() call site in this file */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta_zero/beta_one live in device memory, hence cudaFree() */
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
2162 
/* Free the contents of a triangular-factors container (all four factor
   structures, the permutation index vectors, and the work vector) while
   keeping the container and its cusparse handle alive for reuse. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    /* NULL the freed pointers so a subsequent Reset/Destroy is safe */
    (*trifactors)->rpermIndices = 0;
    (*trifactors)->cpermIndices = 0;
    (*trifactors)->workVector = 0;
  }
  PetscFunctionReturn(0);
}
2182 
/* Fully destroy a triangular-factors container: reset its contents, destroy
   the cusparse handle, and free the container (PetscFree() NULLs *trifactors). */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* assign before testing; the original "if (handle = ...)" assignment
       inside the condition draws -Wparentheses warnings and invites ==/= typos */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2199