xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision a32341863366332f7a65d929fce33ce85c8bfbdc)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* String names for MatCUSPARSEStorageFormat; the trailing entries (type name, option prefix, NULL) follow the PetscEnum registration convention */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 
19 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
20 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
21 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
22 
23 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
24 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
26 
27 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
31 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
32 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
35 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
36 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
37 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
38 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
39 
40 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
41 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
42 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
43 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
44 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
45 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
46 
/* Attaches a CUDA stream to the cuSPARSE handle of A so that subsequent cuSPARSE
   calls made through this matrix are queued on that stream. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  /* remember the stream, then point the library handle at it */
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
57 
/* Installs a caller-supplied cuSPARSE handle on A. Any previously held (different)
   handle is destroyed first. The pointer mode is always (re)set to device pointers. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    /* release the old handle before adopting the new one */
    if (cusparsestruct->handle) {stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);}
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
73 
/* Detaches (without destroying) the cuSPARSE handle currently attached to A.
   The handle is only forgotten here, not destroyed — it is assumed to be owned
   elsewhere (e.g. installed via MatCUSPARSESetHandle()). */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* guard against a matrix whose GPU data structure was never created (or already destroyed) */
  if (!cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
82 
/* Query callback composed on factor matrices: reports MATSOLVERCUSPARSE as the
   solver package that produced the factors. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
89 
90 /*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
94   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
95   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
96   algorithms are not recommended. This class does NOT support direct solver operations.
97 
98   Level: beginner
99 
100 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
101 M*/
102 
/* Factory routine registered with MatSolverType: creates an (unfilled) factor matrix B of
   type MATSEQAIJCUSPARSE for A and installs the symbolic-factorization methods matching
   the requested factor type (LU/ILU/ILUDT or Cholesky/ICC). */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  /* NOTE(review): factortype is set before MatSetType() — presumably so the type
     constructor can see that B is a factor matrix; confirm before reordering */
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* skip allocation: the numeric factorization provides the actual storage */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
127 
/* Type-specific implementation of MatCUSPARSESetFormat() for MATSEQAIJCUSPARSE.
   A sequential matrix keeps a single storage format, so MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL are handled identically. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: both operations set the same field */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
145 
146 /*@
147    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
148    operation. Only the MatMult operation can use different GPU storage formats
149    for MPIAIJCUSPARSE matrices.
150    Not Collective
151 
152    Input Parameters:
153 +  A - Matrix of type SEQAIJCUSPARSE
154 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
155 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
156 
157    Output Parameter:
158 
159    Level: intermediate
160 
161 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
162 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* dispatch to the type-specific implementation when one is composed on A;
     silently do nothing for other matrix types */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
172 
173 /*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
175 
176    Collective on mat
177 
178    Input Parameters:
179 +  A - Matrix of type SEQAIJCUSPARSE
180 -  transgen - the boolean flag
181 
182    Level: intermediate
183 
184 .seealso: MATSEQAIJCUSPARSE
185 @*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      iscusp;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* silently ignore matrices that are not MATSEQAIJCUSPARSE */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusp);CHKERRQ(ierr);
  if (iscusp) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
  }
  PetscFunctionReturn(0);
}
201 
/* Processes run-time options for a MATSEQAIJCUSPARSE matrix: the explicit-transpose
   flag and the GPU storage formats for SpMV and (for factored matrices) TriSolve. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  Mat_SeqAIJCUSPARSE       *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* these two options only make sense for a non-factored matrix */
    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",cusp->transgen,&cusp->transgen,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,fmt);CHKERRQ(ierr);}
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
  if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);}
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
227 
/* Symbolic ILU factorization: the fill pattern is computed by the CPU AIJ routine,
   then the numeric phase is redirected to the CUSPARSE implementation. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* the numeric phase copies the factors to the GPU and builds the cuSPARSE solve info */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
237 
/* Symbolic LU factorization: the fill pattern is computed by the CPU AIJ routine,
   then the numeric phase is redirected to the CUSPARSE implementation. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* the numeric phase copies the factors to the GPU and builds the cuSPARSE solve info */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
247 
/* Symbolic ICC factorization: the fill pattern is computed by the CPU AIJ routine,
   then the numeric phase is redirected to the CUSPARSE implementation. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  /* the numeric phase copies the factors to the GPU and builds the cuSPARSE solve info */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
257 
/* Symbolic Cholesky factorization: the fill pattern is computed by the CPU AIJ routine,
   then the numeric phase is redirected to the CUSPARSE implementation. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  /* the numeric phase copies the factors to the GPU and builds the cuSPARSE solve info */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
267 
/* Extracts the unit-diagonal lower triangular factor L from the combined CPU ILU
   factor stored in A (a Mat_SeqAIJ that holds both L and U row-wise), uploads it
   as a CSR matrix to the GPU, and runs the cuSPARSE triangular solve analysis.
   Host staging buffers are pinned (cudaMallocHost) to speed the upload. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only rebuild when the up-to-date data lives on the CPU (or nothing is on the GPU yet) */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix: row 0 is just the unit diagonal entry */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: the thrust assign()s below perform the host-to-device copies */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* the staging buffers are no longer needed once the device copies are done */
      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
372 
/* Extracts the upper triangular factor U from the combined CPU ILU factor stored in A,
   uploads it as a CSR matrix to the GPU, and runs the cuSPARSE triangular solve analysis.
   The diagonal is stored inverted (1/d) since a->a holds the reciprocal-free factor;
   rows are walked backwards using a->diag, which points at the diagonal of each row. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only rebuild when the up-to-date data lives on the CPU (or nothing is on the GPU yet) */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements (stored as the reciprocal of the pivot) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i */
        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: the thrust assign()s below perform the host-to-device copies */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* the staging buffers are no longer needed once the device copies are done */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
473 
/* Copies the host ILU factors of A to the GPU: (re)builds the lower and upper
   triangular cuSPARSE matrices, allocates the solve work vector, and uploads the
   row/column permutations when they are not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n,nperm = 0;

  PetscFunctionBegin;
  /* drop any stale GPU factors before rebuilding both triangular factors */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation, applied before the lower triangular solve */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    nperm += n;
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* column permutation, applied after the upper triangular solve */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    nperm += n;
  }

  /* log exactly the number of permutation entries actually uploaded */
  if (nperm) {ierr = PetscLogCpuToGpu(nperm*sizeof(PetscInt));CHKERRQ(ierr);}

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
521 
/* Builds both triangular factors of an ICC factorization on the GPU from the CPU
   factor stored in A. Only the upper-triangular pattern (AiUp/AjUp) is assembled on
   the host; the "lower" solve reuses that pattern with CUSPARSE_OPERATION_TRANSPOSE
   (see loTriFactor->solveOp below), so L is applied as the transpose of an
   upper-stored matrix. Host staging buffers are pinned for the upload. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only rebuild when the up-to-date data lives on the CPU (or nothing is on the GPU yet) */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix; AAUp and AALo share the pattern but scale entries differently */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements (as the reciprocal of the pivot) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* negate the off-diagonals; the lower copy is additionally scaled by the pivot reciprocal */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: the thrust assign()s below perform the host-to-device copies */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: note the lower factor is stored with the UPPER
         pattern and solved with the TRANSPOSE operation (set below) */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same pattern as the upper factor, different values (AALo) */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      /* the staging buffers are no longer needed once the device copies are done */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
669 
/*
   Prepares the GPU-side data needed by the triangular solves after an ICC
   factorization: resets any previous factor data, rebuilds the triangular
   matrices on the GPU, allocates the work vector used by the two-stage solve,
   and, for a non-identity ordering, uploads the row permutation and its
   inverse so the solve kernels can permute the right-hand side and solution.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;          /* row permutation from the factorization */
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* drop any previously built factors/permutations before rebuilding */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* scratch vector shared by the solve routines (holds the intermediate of the two triangular solves) */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* nnz of both triangular factors combined: off-diagonal entries counted twice, diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    /* upload the permutation (rperm) and its inverse (cperm) to the GPU;
       only needed when the ordering is not the identity */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
706 
/*
   Numeric LU factorization for SEQAIJCUSPARSE. The factorization itself runs
   on the CPU via MatLUFactorNumeric_SeqAIJ; this wrapper then installs the
   appropriate GPU solve kernels and copies/analyzes the triangular factors on
   the GPU. When both the row and column orderings are the identity, the
   "NaturalOrdering" solve variants (which skip the permutation steps) are used.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      rowIdentity,colIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side numeric factorization; the current copy of the factors now lives on the CPU */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the MatSolve implementations based on the orderings used by the factorization */
  ierr = ISIdentity(fact->row,&rowIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(fact->col,&colIdentity);CHKERRQ(ierr);
  if (rowIdentity && colIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no GPU MatMatSolve support */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build the triangular factors on the GPU and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
736 
/*
   Numeric Cholesky factorization for SEQAIJCUSPARSE. The factorization runs on
   the CPU via MatCholeskyFactorNumeric_SeqAIJ; this wrapper then installs the
   GPU solve kernels (the permutation-free "NaturalOrdering" variants when the
   ordering is the identity) and uploads/analyzes the triangular factors.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      permIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side numeric factorization; the current copy of the factors now lives on the CPU */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the MatSolve implementations based on the ordering used by the factorization */
  ierr = ISIdentity(fact->row,&permIdentity);CHKERRQ(ierr);
  B->ops->solve          = permIdentity ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = permIdentity ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  /* no GPU MatMatSolve support */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build the triangular factors on the GPU and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
765 
/*
   Builds explicit transposes of the stored lower and upper triangular factors
   (via cusparse csr2csc) and runs the cusparse solve analysis on them, so that
   the MatSolveTranspose routines can use CUSPARSE_OPERATION_NON_TRANSPOSE
   solves on the pre-transposed data. Results are stored in
   loTriFactorPtrTranspose/upTriFactorPtrTranspose on A->spptr.

   Fix: loTriFactorT/upTriFactorT were previously initialized from
   cusparseTriFactors->...PtrTranspose and then immediately overwritten with
   `new`, i.e. dead stores that misleadingly suggested reuse of an existing
   transpose (and would hide a leak if one existed; callers guard with
   `if (!loTriFactorT && !upTriFactorT)`). The pointless initializers are
   removed; behavior is unchanged.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;  /* allocated below */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;  /* allocated below */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor: same type,
     index base, and diagonal type as the original, but with the fill mode
     flipped, since transposing swaps lower and upper triangular structure */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* set the operation: the data is already transposed, so the solve is a plain one */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor; as above, the
     fill mode is flipped for the transpose */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
896 
/*
   Builds the GPU data structures for multiplying with the transpose of A
   (used by MatMultTranspose) and stores them in cusparsestruct->matTranspose.
   For the CSR format this is a single csr2csc; for ELL/HYB the stored matrix
   is first converted to CSR, transposed, and converted back to HYB.
   No-op when transpose generation is disabled (cusparsestruct->transgen).
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  /* NOTE(review): this initializer is a dead store -- matstructT is overwritten
     by `new` below; if a transpose already existed it would be leaked here.
     Presumably callers only invoke this when matTranspose is NULL -- confirm. */
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* nothing to do when transpose generation is turned off */
  if (!cusparsestruct->transgen) PetscFunctionReturn(0);
  /* allocate space for the triangular factor information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  /* the transpose inherits the index base of the original matrix */
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalar constants (1, 0, 1) used by the
     SpMV calls, since the handle is put in device pointer mode below */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* CSR: one csr2csc produces the CSR of A^T directly (dimensions swapped) */
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* upload the (uncompressed) host row offsets a->i for use as csr2csc input.
       NOTE(review): allocated unconditionally -- if rowoffsets_gpu already
       existed this would leak; confirm callers reset matTranspose first */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    /* ELL corresponds to a HYB partition with everything in the ELL part */
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1023 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   Solves A^T x = b on the GPU for a permuted factorization. Since
   A^T = (LU)^T = U^T L^T, the upper factor's transpose is applied first, then
   the lower factor's transpose; both are explicitly transposed copies built by
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve so plain (NON_TRANSPOSE) solves
   can be used. The right-hand side/solution are permuted with the
   rperm/cperm index arrays uploaded at factorization time.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first transpose solve only) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x <- b[rperm] */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (i.e. apply U^T via its pre-transposed copy), x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L (i.e. apply L^T via its pre-transposed copy), tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1094 
/*
   Solves A^T x = b on the GPU for a factorization in natural (identity)
   ordering -- identical to MatSolveTranspose_SeqAIJCUSPARSE but with the
   row/column permutation steps omitted. U's transpose is applied first, then
   L's transpose, using the pre-transposed factors built by
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first transpose solve only) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U (i.e. apply U^T via its pre-transposed copy), b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L (i.e. apply L^T via its pre-transposed copy), tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1146 
/*
   Solves A x = b on the GPU for a permuted factorization: gather b with the
   row permutation, forward-solve with L, back-solve with U, then scatter the
   result with the column permutation. The rperm/cperm index arrays and the
   work vector were set up when the factors were copied to the GPU.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: tempGPU <- b[rperm] */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Then, solve U: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: x <- tempGPU[cperm] */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1205 
/*
   Solves A x = b on the GPU for a factorization in natural (identity)
   ordering -- identical to MatSolve_SeqAIJCUSPARSE but with the
   row/column permutation steps omitted: forward-solve with L into the work
   vector, then back-solve with U into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Next, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1249 
/*
  Mirror the host-side AIJ data of A onto the GPU.  No-op when A is bound to
  the CPU or the GPU copy is already current.  If the nonzero pattern is
  unchanged since the last copy (and format is CSR), only the numerical
  values are refreshed (including a cached transpose, if one was built);
  otherwise the GPU structures are rebuilt from scratch in the requested
  storage format (CSR, ELL, or HYB).  On success A->offloadmask is
  PETSC_OFFLOAD_BOTH.
*/
1250 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1251 {
1252   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1253   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1254   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1255   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1256   PetscErrorCode               ierr;
1257   cusparseStatus_t             stat;
1258   cudaError_t                  err;
1259 
1260   PetscFunctionBegin;
1261   if (A->boundtocpu) PetscFunctionReturn(0);
1262   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1263     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /* Fast path: same nonzero pattern as the previous copy, so the existing
       device index arrays can be reused and only values are transferred */
1264     if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1265       /* Copy values only */
1266       CsrMatrix *mat,*matT;
1267       mat  = (CsrMatrix*)cusparsestruct->mat->mat;
1268       mat->values->assign(a->a, a->a+a->nz);
1269       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1270 
1271       /* Update matT when it was built before */
1272       if (cusparsestruct->matTranspose) {
1273         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1274         matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        /* numeric-only csr2csc: refresh the transpose's values in place */
1275         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1276                                  A->cmap->n, mat->num_entries,
1277                                  mat->values->data().get(),
1278                                  cusparsestruct->rowoffsets_gpu->data().get(),
1279                                  mat->column_indices->data().get(),
1280                                  matT->values->data().get(),
1281                                  matT->column_indices->data().get(),
1282                                  matT->row_offsets->data().get(),
1283                                  CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
1284       }
1285     } else {
      /* Structure changed (or first copy): tear down the old device data and rebuild */
1286       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1287       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
1288       delete cusparsestruct->workVector;
1289       delete cusparsestruct->rowoffsets_gpu;
1290       try {
        /* with compressed rows, only the nonzero rows are stored on the device */
1291         if (a->compressedrow.use) {
1292           m    = a->compressedrow.nrows;
1293           ii   = a->compressedrow.i;
1294           ridx = a->compressedrow.rindex;
1295         } else {
1296           m    = A->rmap->n;
1297           ii   = a->i;
1298           ridx = NULL;
1299         }
1300         cusparsestruct->nrows = m;
1301 
        /* allocate space for the matrix-vector multiply information
           (the old comment said "triangular factor", but this is the MultStruct) */
1303         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1304         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1305         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1306         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1307 
        /* scalars live on the device because pointer mode below is DEVICE */
1308         err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
1309         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1310         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1311         err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1312         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1313         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1314         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1315 
1316         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1317         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1318           /* set the matrix */
1319           CsrMatrix *matrix= new CsrMatrix;
1320           matrix->num_rows = m;
1321           matrix->num_cols = A->cmap->n;
1322           matrix->num_entries = a->nz;
1323           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1324           matrix->row_offsets->assign(ii, ii + m+1);
1325 
1326           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1327           matrix->column_indices->assign(a->j, a->j+a->nz);
1328 
1329           matrix->values = new THRUSTARRAY(a->nz);
1330           matrix->values->assign(a->a, a->a+a->nz);
1331 
1332           /* assign the pointer */
1333           matstruct->mat = matrix;
1334 
1335         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* ELL/HYB: build a temporary CSR on the device, convert it, then free the CSR */
1336           CsrMatrix *matrix= new CsrMatrix;
1337           matrix->num_rows = m;
1338           matrix->num_cols = A->cmap->n;
1339           matrix->num_entries = a->nz;
1340           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1341           matrix->row_offsets->assign(ii, ii + m+1);
1342 
1343           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1344           matrix->column_indices->assign(a->j, a->j+a->nz);
1345 
1346           matrix->values = new THRUSTARRAY(a->nz);
1347           matrix->values->assign(a->a, a->a+a->nz);
1348 
1349           cusparseHybMat_t hybMat;
1350           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          /* MAX partition ==> pure ELL; AUTO ==> cusparse picks the ELL/COO split */
1351           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1352             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1353           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1354               matstruct->descr, matrix->values->data().get(),
1355               matrix->row_offsets->data().get(),
1356               matrix->column_indices->data().get(),
1357               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1358           /* assign the pointer */
1359           matstruct->mat = hybMat;
1360 
1361           if (matrix) {
1362             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1363             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1364             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1365             delete (CsrMatrix*)matrix;
1366           }
1367         }
1368 
1369         /* assign the compressed row indices */
1370         if (a->compressedrow.use) {
          /* workVector holds SpMV input/output for the short (nonzero-row) space;
             cprowIndices maps short indices back to full row indices */
1371           cusparsestruct->workVector = new THRUSTARRAY(m);
1372           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1373           matstruct->cprowIndices->assign(ridx,ridx+m);
1374           tmp = m;
1375         } else {
1376           cusparsestruct->workVector = NULL;
1377           matstruct->cprowIndices    = NULL;
1378           tmp = 0;
1379         }
1380         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1381 
1382         /* assign the pointer */
1383         cusparsestruct->mat = matstruct;
1384       } catch(char *ex) {
1385         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1386       }
1387       cusparsestruct->nonzerostate = A->nonzerostate;
1388     }
1389     err  = WaitForGPU();CHKERRCUDA(err);
1390     A->offloadmask = PETSC_OFFLOAD_BOTH;
1391     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1392   }
1393   PetscFunctionReturn(0);
1394 }
1395 
/* Thrust functor for zipped iterators: accumulate the first tuple element
   into the second, i.e. t<1> += t<0>.  Used below to scatter-add the
   compressed work vector into the full-length result vector. */
struct VecCUDAPlusEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1405 
/* Thrust functor for zipped iterators: copy the second tuple element into the
   first (t<0> = t<1>).  Used below to gather entries of the full input vector
   into the compressed work vector. */
1406 struct VecCUDAEqualsReverse
1407 {
1408   template <typename Tuple>
1409   __host__ __device__
1410   void operator()(Tuple t)
1411   {
1412     thrust::get<0>(t) = thrust::get<1>(t);
1413   }
1414 };
1415 
/* Product data attached to C->product for AIJCUSPARSE (sparse) times
   DENSECUDA (dense) products; freed by MatDestroy_MatMatCusparse() */
1416 typedef struct {
1417   PetscBool   cisdense;  /* C was MATSEQDENSE (CPU) at symbolic time, so numeric converts the result back to the CPU */
1418   PetscScalar *Bt;       /* device buffer for an explicit transpose of B (ABt/RARt only, since spmm cannot transpose B); NULL otherwise */
1419   Mat         X;         /* intermediate dense product for PtAP/RARt; NULL otherwise */
1420 } MatMatCusparse;
1421 
/* Release the MatMatCusparse product data: the device transpose buffer, the
   intermediate dense matrix, and the struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse*)data;
  cudaError_t    cerr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a harmless no-op */
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1434 
1435 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1436 
/*
  Numeric phase of C = op(A) op(B) with sparse CUSPARSE A and dense CUDA B,
  for product types AB, AtB, ABt, PtAP, RARt.  The sparse-times-dense part is
  one cusparse spmm call; PtAP/RARt first form X = A*B (resp. A*B^T) and then
  finish with a dense product B^T*X (resp. B*X).  Since spmm cannot transpose
  its dense operand, ABt/RARt explicitly transpose B into mmdata->Bt with
  cublasXgeam first.
*/
1437 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1438 {
1439   Mat_Product                  *product = C->product;
1440   Mat                          A,B;
1441   PetscInt                     m,n,k,blda,clda;
1442   PetscBool                    flg,biscuda;
1443   Mat_SeqAIJCUSPARSE           *cusp;
1444   cusparseStatus_t             stat;
1445   cusparseOperation_t          opA;
1446   cusparseMatDescr_t           matA;
1447   const PetscScalar            *barray;
1448   PetscScalar                  *carray;
1449   PetscErrorCode               ierr;
1450   MatMatCusparse               *mmdata;
1451   Mat_SeqAIJCUSPARSEMultStruct *mat;
1452   CsrMatrix                    *csrmat;
1453   cudaError_t                  cuer;
1454 
1455   PetscFunctionBegin;
1456   MatCheckProduct(C,1);
1457   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1458   mmdata = (MatMatCusparse*)product->data;
1459   A    = product->A;
1460   B    = product->B;
1461   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1462   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1463   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1464      Instead of silently accepting the wrong answer, I prefer to raise the error */
1465   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1466   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1467   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick the sparse operand (A or its cached transpose), the cusparse op,
     and the m x n result / k contraction sizes for the spmm call */
1468   switch (product->type) {
1469   case MATPRODUCT_AB:
1470   case MATPRODUCT_PtAP:
1471     mat = cusp->mat;
1472     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1473     m   = A->rmap->n;
1474     k   = A->cmap->n;
1475     n   = B->cmap->n;
1476     break;
1477   case MATPRODUCT_AtB:
    /* prefer the explicitly stored transpose when transgen is enabled */
1478     if (!cusp->transgen) {
1479       mat = cusp->mat;
1480       opA = CUSPARSE_OPERATION_TRANSPOSE;
1481     } else {
1482       if (!cusp->matTranspose) {
1483         ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1484       }
1485       mat = cusp->matTranspose;
1486       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1487     }
1488     m = A->cmap->n;
1489     k = A->rmap->n;
1490     n = B->cmap->n;
1491     break;
1492   case MATPRODUCT_ABt:
1493   case MATPRODUCT_RARt:
1494     mat = cusp->mat;
1495     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1496     m   = A->rmap->n;
1497     k   = B->cmap->n;
1498     n   = B->rmap->n;
1499     break;
1500   default:
1501     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1502   }
1503   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
1504   matA   = mat->descr;
1505   csrmat = (CsrMatrix*)mat->mat;
1506   /* if the user passed a CPU matrix, copy the data to the GPU */
1507   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
1508   if (!biscuda) {
1509     ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
1510   }
1511   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
1512   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the spmm result into the intermediate X, not into C */
1513   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1514     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1515     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
1516   } else {
1517     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
1518     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
1519   }
1520 
1521   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1522 
1523   /* cusparse spmm does not support transpose on B */
1524   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1525     cublasHandle_t cublasv2handle;
1526     cublasStatus_t cerr;
1527 
1528     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    /* out-of-place transpose: Bt = B^T (geam with alpha=1, beta=0) */
1529     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
1530                        B->cmap->n,B->rmap->n,
1531                        &PETSC_CUSPARSE_ONE ,barray,blda,
1532                        &PETSC_CUSPARSE_ZERO,barray,blda,
1533                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
1534     blda = B->cmap->n;
1535   }
1536 
1537   /* perform the MatMat operation */
1538   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
1539                            csrmat->num_entries,mat->alpha,matA,
1540                            csrmat->values->data().get(),
1541                            csrmat->row_offsets->data().get(),
1542                            csrmat->column_indices->data().get(),
1543                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
1544                            carray,clda);CHKERRCUSPARSE(stat);
1545   cuer = WaitForGPU();CHKERRCUDA(cuer);
1546   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1547   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
1548   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish the two-stage products with a dense multiply: RARt: C = B*X; PtAP: C = B^T*X */
1549   if (product->type == MATPRODUCT_RARt) {
1550     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1551     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
1552   } else if (product->type == MATPRODUCT_PtAP) {
1553     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1554     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
1555   } else {
1556     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
1557   }
  /* restore the CPU types the caller handed in */
1558   if (mmdata->cisdense) {
1559     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
1560   }
1561   if (!biscuda) {
1562     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
1563   }
1564   PetscFunctionReturn(0);
1565 }
1566 
/*
  Symbolic phase of sparse(CUSPARSE) x dense(CUDA) products: determines the
  size of C from the product type, sets C's type to dense CUDA (remembering
  whether the caller wanted a CPU dense result), allocates the helper data
  (transpose buffer Bt for ABt/RARt, intermediate X for PtAP/RARt), and
  installs the numeric-phase callback.
*/
1567 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1568 {
1569   Mat_Product        *product = C->product;
1570   Mat                A,B;
1571   PetscInt           m,n;
1572   PetscBool          cisdense,flg;
1573   PetscErrorCode     ierr;
1574   MatMatCusparse     *mmdata;
1575   Mat_SeqAIJCUSPARSE *cusp;
1576 
1577   PetscFunctionBegin;
1578   MatCheckProduct(C,1);
1579   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
1580   A    = product->A;
1581   B    = product->B;
1582   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1583   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1584   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1585   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions per product type */
1586   switch (product->type) {
1587   case MATPRODUCT_AB:
1588     m = A->rmap->n;
1589     n = B->cmap->n;
1590     break;
1591   case MATPRODUCT_AtB:
1592     m = A->cmap->n;
1593     n = B->cmap->n;
1594     break;
1595   case MATPRODUCT_ABt:
1596     m = A->rmap->n;
1597     n = B->rmap->n;
1598     break;
1599   case MATPRODUCT_PtAP:
1600     m = B->cmap->n;
1601     n = B->cmap->n;
1602     break;
1603   case MATPRODUCT_RARt:
1604     m = B->rmap->n;
1605     n = B->rmap->n;
1606     break;
1607   default:
1608     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1609   }
1610   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
1611   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
1612   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
1613   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
1614 
1615   /* product data */
1616   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
1617   mmdata->cisdense = cisdense;
1618   /* cusparse spmm does not support transpose on B */
1619   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1620     cudaError_t cerr;
1621 
1622     cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
1623   }
1624   /* for these products we need intermediate storage */
1625   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1626     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
1627     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
1628     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
1629       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
1630     } else {
1631       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
1632     }
1633   }
1634   C->product->data    = mmdata;
1635   C->product->destroy = MatDestroy_MatMatCusparse;
1636 
1637   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
1638   PetscFunctionReturn(0);
1639 }
1640 
1641 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
1642 
/* Dispatch hook for products whose A is SeqAIJCUSPARSE and B is dense:
   installs the GPU symbolic phase for the supported product types, or falls
   back to the plain SeqAIJ*SeqDense path when A is bound to the CPU.
   Unsupported types are left untouched so other handlers can claim them. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  /* GPU kernels would not see CPU-side updates of A; use the host implementation */
  if (product->A->boundtocpu) {
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
    break; /* previously an implicit fallthrough into default; made explicit to be warning-clean and robust */
  default:
    break;
  }
  PetscFunctionReturn(0);
}
1668 
/* yy = A*xx: thin wrapper over the shared kernel with no add vector and no transpose */
1669 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1670 {
1671   PetscErrorCode ierr;
1672 
1673   PetscFunctionBegin;
1674   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
1675   PetscFunctionReturn(0);
1676 }
1677 
/* zz = A*xx + yy: thin wrapper over the shared kernel with no transpose */
1678 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
1679 {
1680   PetscErrorCode ierr;
1681 
1682   PetscFunctionBegin;
1683   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
1684   PetscFunctionReturn(0);
1685 }
1686 
/* yy = A^H * xx: thin wrapper over the shared kernel (trans=TRUE, herm=TRUE) */
1687 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1688 {
1689   PetscErrorCode ierr;
1690 
1691   PetscFunctionBegin;
1692   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
1693   PetscFunctionReturn(0);
1694 }
1695 
/* zz = A^H * xx + yy: thin wrapper over the shared kernel (trans=TRUE, herm=TRUE) */
1696 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1697 {
1698   PetscErrorCode ierr;
1699 
1700   PetscFunctionBegin;
1701   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
1702   PetscFunctionReturn(0);
1703 }
1704 
/* yy = A^T * xx: thin wrapper over the shared kernel (trans=TRUE, herm=FALSE) */
1705 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1706 {
1707   PetscErrorCode ierr;
1708 
1709   PetscFunctionBegin;
1710   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
1711   PetscFunctionReturn(0);
1712 }
1713 
/*
  Shared kernel behind MatMult/MatMultAdd and their (Hermitian) transpose
  variants: zz = op(A)*xx, plus yy when yy is non-NULL, where op is A, A^T or
  A^H according to (trans,herm).  When A stores compressed rows (zero rows
  dropped) the SpMV runs over a short work vector that is gathered from /
  scattered back to the full-length vectors with thrust functors.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* A has no stored nonzeros, so op(A)*xx is zero and the result is yy (or zero) */
    if (!yy) {
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    } else if (zz != yy) {
      /* bug fix: previously zz was left untouched here, so MatMultAdd with
         yy != zz returned stale data instead of zz = yy */
      ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    }
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* choose the stored matrix (A or its cached transpose) and the cusparse op */
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  } else {
    if (herm || !cusparsestruct->transgen) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      if (!matstruct) {
        ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
        matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      }
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv output if compressed */
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
    } else {
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; /* Use a work vector for SpMv input if compressed */
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;

      if (compressed) {
        /* gather the entries of xx at the stored (nonzero) rows into the work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
        cerr = WaitForGPU();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    }

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv does y = alpha*Ax + beta*y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      if (cusparsestruct->nrows) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    cerr = WaitForGPU();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForGPU();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* one multiply+add per stored nonzero; without the add vector, subtract one add per nonzero row */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1840 
/* zz = A^T * xx + yy: thin wrapper over the shared kernel (trans=TRUE, herm=FALSE) */
1841 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1842 {
1843   PetscErrorCode ierr;
1844 
1845   PetscFunctionBegin;
1846   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
1847   PetscFunctionReturn(0);
1848 }
1849 
/* Finish assembly on the host, then mirror the data to the GPU, but only for
   a final (not flush) assembly of an unfactored matrix that is not bound to
   the CPU. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1862 
1863 /* --------------------------------------------------------------------------------*/
1864 /*@
1865    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1866    (the default parallel PETSc format). This matrix will ultimately be pushed down
1867    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1868    assembly performance the user should preallocate the matrix storage by setting
1869    the parameter nz (or the array nnz).  By setting these parameters accurately,
1870    performance during matrix assembly can be increased by more than a factor of 50.
1871 
1872    Collective
1873 
1874    Input Parameters:
1875 +  comm - MPI communicator, set to PETSC_COMM_SELF
1876 .  m - number of rows
1877 .  n - number of columns
1878 .  nz - number of nonzeros per row (same for all rows)
1879 -  nnz - array containing the number of nonzeros in the various rows
1880          (possibly different for each row) or NULL
1881 
1882    Output Parameter:
1883 .  A - the matrix
1884 
1885    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1886    MatXXXXSetPreallocation() paradigm instead of this routine directly.
1887    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1888 
1889    Notes:
1890    If nnz is given then nz is ignored
1891 
1892    The AIJ format (also called the Yale sparse matrix format or
1893    compressed row storage), is fully compatible with standard Fortran 77
1894    storage.  That is, the stored row and column indices can begin at
1895    either one (as in Fortran) or zero.  See the users' manual for details.
1896 
1897    Specify the preallocated storage with either nz or nnz (not both).
1898    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1899    allocation.  For large problems you MUST preallocate memory or you
1900    will get TERRIBLE performance, see the users' manual chapter on matrices.
1901 
1902    By default, this format uses inodes (identical nodes) when possible, to
1903    improve numerical efficiency of matrix-vector products and solves. We
1904    search for consecutive rows with the same nonzero structure, thereby
1905    reusing matrix information to achieve increased efficiency.
1906 
1907    Level: intermediate
1908 
1909 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1910 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* call the SeqAIJ preallocation implementation directly; the cast drops const because the
     setter takes a non-const array.  NOTE(review): assumes the setter does not modify nnz -- confirm */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1922 
/* Destroy a SeqAIJCUSPARSE matrix: free the GPU-side context stored in spptr (the mult
   context for a regular matrix, or the triangular-factors context for a factored one),
   clear the composed methods, then dispatch to the base SeqAIJ destroy for host data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    /* factored matrices keep triangular-factor data in spptr instead of a mult context */
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  /* reset composed function pointers so nothing dangles after this type's data is gone */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  /* base class handles the host-side (CPU) AIJ storage */
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1940 
1941 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
1942 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by first duplicating the host-side SeqAIJ data, then converting the copy in
   place to AIJCUSPARSE so it acquires its own GPU context and function table. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1952 
/* Bind (flg=PETSC_TRUE) or unbind (flg=PETSC_FALSE) the matrix to the CPU by swapping the
   mult/product function pointers between the SeqAIJ (host) and CUSPARSE (device)
   implementations.  No data is moved; only the operation table and composed product
   routines change. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factored matrices are not rebindable here; nothing to do */
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
           can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
  /* TODO: add support for this? */
  if (flg) {
    /* CPU path: host SeqAIJ kernels; no Hermitian-transpose host implementations are installed */
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    /* drop the GPU matrix-product hooks while bound to the CPU */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  } else {
    /* GPU path: CUSPARSE kernels and GPU matrix-product hooks */
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}
1988 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE.  MAT_INITIAL_MATRIX duplicates A into *newmat;
   MAT_REUSE_MATRIX copies A's values into the existing *newmat; MAT_INPLACE_MATRIX
   retargets A itself.  Allocates the GPU-side context (mult context, or triangular-factors
   context when the matrix is a factor), installs the CUSPARSE operation table, and composes
   the storage-format setter. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat; /* for MAT_INPLACE_MATRIX, *newmat is A itself */

  /* vectors created to match this matrix should default to CUDA storage */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular (non-factored) matrix: allocate the mult context with CSR as default format */
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    } else {
      /* factored matrix: allocate the triangular-factors context instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; /* nothing has been copied to the GPU yet */
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU mult/product operations (i.e. start unbound from the CPU) */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2034 
/* Registered constructor for MATSEQAIJCUSPARSE: build a plain SeqAIJ matrix, then convert
   it in place so it picks up the CUSPARSE context and operation table. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2044 
2045 /*MC
2046    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2047 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.
2051 
2052    Options Database Keys:
2053 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2054 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2055 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2056 
2057   Level: beginner
2058 
2059 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2060 M*/
2061 
2062 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
2063 
2064 
/* Register MATSOLVERCUSPARSE as a solver package for MATSEQAIJCUSPARSE matrices.  All four
   factorization kinds (LU, Cholesky, ILU, ICC) map to the same factor-creation routine. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2076 
/* Free a Mat_SeqAIJCUSPARSE context: the GPU mult structures for the matrix and its cached
   transpose, the scratch work vector, the cached device row-offsets array, and the cusparse
   library handle, then the context struct itself.  A NULL *cusparsestruct is a no-op.

   Fix: the previous version tested the handle with an assignment inside the if condition
   (if (handle = ...)), which is easy to misread as a comparison and draws compiler
   warnings; test the stored handle directly instead (the spare local is gone too). */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;   /* deleting NULL is well-defined, so no guards needed */
    delete (*cusparsestruct)->rowoffsets_gpu;
    if ((*cusparsestruct)->handle) {        /* only destroy a handle that was actually created */
      stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2096 
/* Release a CsrMatrix (its values, column-index, and row-offset arrays plus the struct
   itself) and zero the caller's pointer.  Passing a pointer to NULL is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(0);
  delete csr->row_offsets;
  delete csr->column_indices;
  delete csr->values;
  delete csr;
  *mat = 0;
  PetscFunctionReturn(0);
}
2109 
/* Free one triangular-factor structure: its cusparse matrix descriptor, its solve-analysis
   info, and its CSR storage, then the struct itself.  A NULL *trifactor is a no-op.
   NOTE(review): despite the MultStruct name, this overload operates on a
   Mat_SeqAIJCUSPARSETriFactorStruct (C++ overloading selects it by argument type). */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
2125 
/* Free a mult structure: the format-specific matrix storage (HYB/ELL handle or CSR arrays),
   the cusparse matrix descriptor, the compressed-row index vector, and the device-resident
   alpha/beta scalar constants, then the struct itself.  A NULL *matstruct is a no-op.

   Fix: the CsrMatrix_Destroy() return code was silently dropped, contrary to this file's
   CHKERRQ convention for every PetscErrorCode-returning call; it is now checked. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        /* ELL and HYB storage are both held as an opaque cusparse hybrid matrix */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;   /* deleting NULL is well-defined */
    /* alpha/beta_zero/beta_one live in device memory, hence cudaFree rather than delete */
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
2153 
/* Release the contents of a triangular-factors container -- the four factor structures
   (lower/upper and their cached transposes), the row/column permutation index arrays, and
   the work vector -- without freeing the container or its cusparse handle, so it can be
   refilled by a subsequent factorization.  A NULL *trifactors is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    /* deleting NULL is well-defined, so no guards are needed here */
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    /* zero the freed members so the container is safe to reuse */
    (*trifactors)->rpermIndices = 0;
    (*trifactors)->cpermIndices = 0;
    (*trifactors)->workVector = 0;
  }
  PetscFunctionReturn(0);
}
2173 
/* Fully destroy a triangular-factors container: reset its contents, destroy its cusparse
   handle if one was created, and free the container.  A NULL *trifactors is a no-op.

   Fix: as in MatSeqAIJCUSPARSE_Destroy(), the handle test used an assignment inside the
   if condition (if (handle = ...)), which is misreadable and draws compiler warnings;
   the stored handle is now tested directly and the spare local removed. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    if ((*trifactors)->handle) {   /* only destroy a handle that was actually created */
      stat = cusparseDestroy((*trifactors)->handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2190