xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision bef158480efac06de457f7a665168877ab3c2fd7)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
17 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 
19 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
20 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
21 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
22 
23 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
24 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
26 
27 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
31 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
32 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
35 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
36 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
37 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
38 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
39 
40 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
41 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
42 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
43 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
44 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
45 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
46 
/* MatCUSPARSESetStream - Record a CUDA stream on the matrix' CUSPARSE context and attach it
   to the CUSPARSE library handle, so subsequent CUSPARSE calls for this matrix are issued on
   that stream.  The matrix takes no ownership of the stream. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusp->stream = stream;
  stat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
57 
/* MatCUSPARSESetHandle - Replace the CUSPARSE library handle stored on the matrix.  A previously
   stored (different) handle is destroyed before the caller's handle is adopted.  The pointer mode
   of the handle is set to CUSPARSE_POINTER_MODE_DEVICE, i.e. scalar arguments to CUSPARSE routines
   are expected to live in device memory. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusp->handle != handle) {
    /* release the handle we currently hold before adopting the caller's */
    if (cusp->handle) {stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);}
    cusp->handle = handle;
  }
  stat = cusparseSetPointerMode(cusp->handle,CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
73 
/* MatCUSPARSEClearHandle - Forget (without destroying) the CUSPARSE handle stored on the matrix.
   Safe to call on a matrix whose GPU context has not been created yet (NULL spptr), in which
   case it is a no-op. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* bug fix: guard against a matrix whose GPU context was never allocated; previously this
     dereferenced a NULL spptr */
  if (!cusparsestruct) PetscFunctionReturn(0);
  cusparsestruct->handle = 0; /* assigning 0 unconditionally is equivalent to the old handle check */
  PetscFunctionReturn(0);
}
82 
/* MatFactorGetSolverType_seqaij_cusparse - Callback composed on factor matrices (see
   MatGetFactor_seqaijcusparse_cusparse() below) that reports the solver package used
   for the factorization: MATSOLVERCUSPARSE. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
89 
90 /*MC
91   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
92   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
94   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
95   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
96   algorithms are not recommended. This class does NOT support direct solver operations.
97 
98   Level: beginner
99 
100 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
101 M*/
102 
/* MatGetFactor_seqaijcusparse_cusparse - Factory registered with PETSc that creates an (empty)
   SEQAIJCUSPARSE matrix configured to hold a factorization of A.  Only the symbolic entry points
   are installed here; the symbolic routines install the matching numeric routines themselves.
   NOTE(review): factortype is assigned before MatSetType() -- presumably the type creation routine
   inspects it when building A->spptr, so this ordering must be preserved. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype  = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
128 
/* MatCUSPARSESetFormat_SeqAIJCUSPARSE - Type-specific implementation behind MatCUSPARSESetFormat():
   records the requested GPU storage format on the matrix' CUSPARSE context.  For sequential
   matrices MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL set the same (single) format field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both operations share one format field here */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
146 
147 /*@
148    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
149    operation. Only the MatMult operation can use different GPU storage formats
150    for MPIAIJCUSPARSE matrices.
151    Not Collective
152 
153    Input Parameters:
154 +  A - Matrix of type SEQAIJCUSPARSE
155 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
156 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
157 
158    Output Parameter:
159 
160    Level: intermediate
161 
162 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
163 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation; a no-op for matrix types that do not compose
     "MatCUSPARSESetFormat_C" (PetscTryMethod, not PetscUseMethod) */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
173 
174 /*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
176 
177    Collective on mat
178 
179    Input Parameters:
180 +  A - Matrix of type SEQAIJCUSPARSE
181 -  transgen - the boolean flag
182 
183    Level: intermediate
184 
185 .seealso: MATSEQAIJCUSPARSE
186 @*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode     ierr;
  PetscBool          isseqaijcusp;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&isseqaijcusp);CHKERRQ(ierr);
  if (!isseqaijcusp) PetscFunctionReturn(0); /* silently ignore other matrix types */
  if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  cusp           = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusp->transgen = transgen;
  if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
206 
/* MatSetFromOptions_SeqAIJCUSPARSE - Process runtime options for SEQAIJCUSPARSE matrices:
   -mat_cusparse_transgen            : keep an explicit transpose for MatMultTranspose
   -mat_cusparse_mult_storage_format : GPU storage format used for SpMV
   -mat_cusparse_storage_format      : GPU storage format used for SpMV and TriSolve
   All options apply only to non-factored matrices. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
    /* bug fix: this option used to be processed for factored matrices as well, but for a factored
       matrix A->spptr points to a Mat_SeqAIJCUSPARSETriFactors (see the casts in the factor
       routines below), so reading cusparsestruct->format dereferenced the wrong structure;
       it is therefore now guarded by the MAT_FACTOR_NONE check */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
237 
/* MatILUFactorSymbolic_SeqAIJCUSPARSE - Symbolic ILU: reuse the host SeqAIJ symbolic phase,
   then install the CUSPARSE-specific numeric factorization routine on B. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
247 
/* MatLUFactorSymbolic_SeqAIJCUSPARSE - Symbolic LU: reuse the host SeqAIJ symbolic phase,
   then install the CUSPARSE-specific numeric factorization routine on B. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
257 
/* MatICCFactorSymbolic_SeqAIJCUSPARSE - Symbolic ICC: reuse the host SeqAIJ symbolic phase,
   then install the CUSPARSE-specific numeric Cholesky routine on B. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
267 
/* MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - Symbolic Cholesky: reuse the host SeqAIJ symbolic
   phase, then install the CUSPARSE-specific numeric Cholesky routine on B. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
277 
/* MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Extract the unit lower triangular factor L from the
   host ILU factorization, materialize its implicit unit diagonal, upload it as a CSR matrix to
   the GPU, and run the CUSPARSE triangular-solve analysis on it.

   Only runs when the host copy is newer than the device copy (offloadmask UNALLOCATED or CPU).
   NOTE(review): the copy loop assumes the factored SeqAIJ storage keeps each row's strictly-lower
   entries at the start of the segment ai[i]..ai[i+1) -- confirm against MatLUFactorNumeric_SeqAIJ. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* (n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1; row 0 of L has none) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host memory, so the thrust assign() calls below copy to the device efficiently */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      /* row 0 of L consists of the unit diagonal entry only */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* strictly-lower entries of row i, then the explicit unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* zero-based triangular matrix, lower fill, unit diagonal (the 1.0's stored above are thus redundant for the solve) */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      /* device-side CSR copy of L; the assign() calls perform the host-to-device transfers */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
382 
/* MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Extract the upper triangular factor U from the host
   ILU factorization, upload it as a CSR matrix to the GPU, and run the CUSPARSE triangular-solve
   analysis on it.

   Only runs when the host copy is newer than the device copy (offloadmask UNALLOCATED or CPU).
   NOTE(review): rows are walked in reverse (i = n-1..0) via a->diag, which suggests the host
   factor stores U rows back-to-front with the diagonal entry last; the diagonal value is used as
   1./v[nz], i.e. the stored value is presumably the inverse of U's diagonal -- confirm against
   MatLUFactorNumeric_SeqAIJ. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned host memory, so the thrust assign() calls below copy to the device efficiently */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      /* walk rows last-to-first, filling AjUp/AAUp from the back so the result is in forward CSR order */
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i */
        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* zero-based triangular matrix, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      /* device-side CSR copy of U; the assign() calls perform the host-to-device transfers */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
483 
/* MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - Push a freshly computed host ILU factorization to
   the GPU: reset any previous device factors, build the lower/upper CSR triangular factors,
   allocate the device work vector, and upload the row/column permutation indices when the
   corresponding index sets are not identities. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow    = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n,nperm = 0;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  factors->workVector = new THRUSTARRAY(n); /* device scratch vector of length n */
  factors->nnz        = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* row permutation indices (uploaded only when isrow is not the identity) */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(r,r+n);
    nperm++;
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* column permutation indices (uploaded only when iscol is not the identity) */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(c,c+n);
    nperm++;
  }

  /* log one n-sized transfer per permutation actually uploaded */
  if (nperm) {ierr = PetscLogCpuToGpu(nperm*n*sizeof(PetscInt));CHKERRQ(ierr);}

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
531 
/* MatSeqAIJCUSPARSEBuildICCTriMatrices - Build both GPU triangular factors for an ICC
   factorization from the host data.  Only the upper factor U is assembled on the host
   (unit diagonal, off-diagonals scaled by -1); the "lower" factor reuses the same sparsity
   pattern and is solved as U^T (solveOp = TRANSPOSE with FILL_MODE_UPPER below).

   Only runs when the host copy is newer than the device copy (offloadmask UNALLOCATED or CPU). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed through BOTH Mat_SeqAIJ (a, above) and Mat_SeqSBAIJ (b, below);
     this relies on the i/j/a members lining up between the two structs -- fragile, verify on change */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* pinned host memory, so the thrust assign() calls below copy to the device efficiently */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      /* row i is stored as diagonal first, then the nz strictly-upper entries; v[nz] is the
         stored diagonal value (NOTE(review): used as 1.0/v[nz], i.e. presumably the host
         factor stores the diagonal such that its reciprocal is wanted -- confirm against
         MatCholeskyFactorNumeric_SeqAIJ) */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* off-diagonals: negated for U; additionally scaled by the diagonal for the transposed (lower) solve */
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* upper fill, UNIT diagonal: the diagonal values stored in AAUp are ignored by this solve */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* the lower factor keeps the SAME upper CSR pattern but is solved transposed (below),
         with NON_UNIT diagonal and the AALo values */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
679 
/* Rebuild the ICC triangular factors on the GPU and cache the row/column
   permutations needed by the permuted MatSolve paths.  Called after a CPU
   Cholesky/ICC numeric factorization. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowperm  = aij->row;
  const PetscInt               *rowind;
  PetscBool                    identity;
  PetscInt                     nrows    = A->rmap->n;

  PetscFunctionBegin;
  /* Discard any stale GPU factors, then build the triangular matrices anew */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  factors->workVector = new THRUSTARRAY(nrows);
  /* total nonzeros over both triangular factors; the diagonal is shared so it is counted once */
  factors->nnz = (aij->nz-nrows)*2 + nrows;

  /* Upload the row permutation and its inverse (used as the column permutation)
     unless the ordering is the identity, in which case no permutation arrays are needed */
  ierr = ISGetIndices(rowperm,&rowind);CHKERRQ(ierr);
  ierr = ISIdentity(rowperm,&identity);CHKERRQ(ierr);
  if (!identity) {
    IS             invperm;
    const PetscInt *invind;

    ierr = ISInvertPermutation(rowperm,PETSC_DECIDE,&invperm);CHKERRQ(ierr);
    ierr = ISGetIndices(invperm,&invind);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(rowind, rowind+nrows);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(invind, invind+nrows);
    ierr = ISRestoreIndices(invperm,&invind);CHKERRQ(ierr);
    ierr = ISDestroy(&invperm);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(rowperm,&rowind);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
716 
/* Numeric LU factorization: factor on the CPU, then select the appropriate GPU
   solve kernels and upload the triangular factors. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* With natural (identity) row and column orderings the cheaper solve kernels
     that skip the permutation steps can be used */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no GPU multi-RHS solve available */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* upload the triangular factors and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
746 
/* Numeric Cholesky factorization: factor on the CPU, then select the appropriate
   GPU solve kernels and upload the triangular factors. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* a natural (identity) ordering admits the solve kernels that skip the permutation steps */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  B->ops->solve          = perm_identity ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering
                                         : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = perm_identity ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering
                                         : MatSolveTranspose_SeqAIJCUSPARSE;
  /* no GPU multi-RHS solve available */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* upload the triangular factors and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
775 
/* Build the transposes of the lower and upper triangular factors (via csr2csc,
   i.e. stored as CSC) and run the cusparse solve analysis on them, so that
   MatSolveTranspose can use NON_TRANSPOSE solves.  Results are cached in
   cusparseTriFactors->{lo,up}TriFactorPtrTranspose; callers invoke this only
   when those pointers are still NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; /* previously initialized from ...PtrTranspose and immediately overwritten (dead store); now assigned below */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the matrix descriptors of the lower triangular factor; the fill mode is
     flipped because the transpose of a lower triangular matrix is upper triangular */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* the transpose is materialized, so the solve operates in NON_TRANSPOSE mode */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* cache the transposed lower factor on the matrix */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the matrix descriptors of the upper triangular factor, flipping the fill mode */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* the transpose is materialized, so the solve operates in NON_TRANSPOSE mode */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* cache the transposed upper factor on the matrix */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
906 
/* Build the transpose of A's GPU matrix (CSR: via csr2csc; ELL/HYB: via
   hyb2csr -> csr2csc -> csr2hyb) so MatMultTranspose can use NON_TRANSPOSE
   SpMV.  The result is cached in cusparsestruct->matTranspose; a no-op when
   transpose generation is disabled or the transpose already exists. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT; /* previously initialized from cusparsestruct->matTranspose and unconditionally overwritten below (dead store); the guard tests the struct field directly */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalars for the SpMV calls */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* the transpose swaps row and column dimensions */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* NOTE(review): rowoffsets_gpu is assigned with new without checking for a prior
       allocation — presumably NULL on this path; confirm against the reset logic */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    CsrMatrix *temp = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;

    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete the CSR intermediates; only the HYB result is kept */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* cache the transpose on the matrix */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1034 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b on the GPU for a permuted factorization: apply the row
   permutation, solve with the transposed upper then lower factors (built
   lazily, once, by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve), then apply
   the column permutation. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly.
     NOTE(review): the guard uses && — if only one of the two transpose factors
     were NULL the other would be used as-is; in this file both are always
     created together by AnalyzeTransposeForSolve, but confirm. */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into x */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transposed upper factor (x -> temp) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve with the transposed lower factor (temp -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1105 
/* Solve A^T x = b on the GPU for a naturally-ordered factorization: no
   permutations, just upper-transpose then lower-transpose triangular solves
   (the transposed factors are built lazily, once). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (both transpose
     factors are created together, so checking either pointer would suffice) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve with the transposed upper factor (b -> temp) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve with the transposed lower factor (temp -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1157 
/* Solve A x = b on the GPU for a permuted factorization: apply the row
   permutation, forward-solve L, back-solve U, then apply the column
   permutation. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into temp (the copied range is
     determined by the index-iterator pair, so the same element base bGPU is
     used at both ends) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L (temp -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Then, solve U (x -> temp) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Last, reorder temp with the column permutation into x */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1216 
/* Solve A x = b on the GPU for a naturally-ordered factorization: no
   permutations, just forward-solve L then back-solve U via the workVector. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L (b -> temp) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Next, solve U (temp -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1260 
/*
   Mirrors the host AIJ matrix onto the GPU as a cusparse matrix.

   Fast path: when the matrix was assembled before, its nonzero pattern is unchanged
   (same nonzerostate) and the format is CSR, only the numerical values are copied,
   and the cached transpose (if one was built) is refreshed numerically.
   Slow path: otherwise the entire GPU representation (CSR, ELL, or HYB) is rebuilt.

   No-op when the matrix is bound to the CPU or the GPU copy is already up to date.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *mat,*matT;
      mat  = (CsrMatrix*)cusparsestruct->mat->mat;

      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      mat->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* csr2csc with CUSPARSE_ACTION_NUMERIC refreshes the transpose values in place;
           the (unchanged) transpose sparsity pattern is reused */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                 A->cmap->n, mat->num_entries,
                                 mat->values->data().get(),
                                 cusparsestruct->rowoffsets_gpu->data().get(),
                                 mat->column_indices->data().get(),
                                 matT->values->data().get(),
                                 matT->column_indices->data().get(),
                                 matT->row_offsets->data().get(),
                                 CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
        err  = WaitForCUDA();CHKERRCUDA(err);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      /* Nonzero pattern changed (or non-CSR format): discard and rebuild the GPU copy */
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        /* With compressed rows, only the nonzero rows are stored; ridx maps the
           compressed row index back to the full row index */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        cusparsestruct->nrows = m;

        /* create cusparse matrix */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* alpha/beta scalars are kept on the device so cusparse can run in device pointer mode */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* stage a temporary CSR copy on the device, then convert it to HYB/ELL */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging arrays are no longer needed once the HYB matrix is built */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    /* CPU and GPU copies now agree */
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1419 
/* Thrust functor: for a tuple t = (src, dst), performs dst = dst + src.
   Used below to scatter-add the compressed-row work vector into the full
   result vector (zipped with a permutation iterator over cprowIndices). */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};
1429 
/* Thrust functor: for a tuple t = (dst, src), performs dst = src.
   Used below to gather entries of the input vector (at the nonzero rows,
   selected via a permutation iterator) into the compressed-row work vector. */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
1439 
/* Auxiliary data attached to a MatProduct for C = op(A)*op(B) with A of type
   SeqAIJCUSPARSE and B dense; set up in the symbolic phase, consumed in the
   numeric phase, released by MatDestroy_MatMatCusparse() */
typedef struct {
  PetscBool   cisdense; /* C was MATSEQDENSE (CPU) on entry: convert the result back from CUDA */
  PetscScalar *Bt;      /* device buffer for an explicit transpose of B (ABt/RARt only; cusparse spmm cannot transpose the dense operand) */
  Mat         X;        /* intermediate dense product (RARt/PtAP only) */
} MatMatCusparse;
1445 
/* Destructor for the MatMatCusparse product data attached to C->product:
   releases the device transpose buffer and the intermediate dense matrix,
   then frees the structure itself */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse*)data;
  cudaError_t    cuer;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Bt/X may never have been set (product-type dependent); cudaFree(NULL) and
     MatDestroy() on a NULL matrix are both no-ops */
  cuer = cudaFree(mm->Bt);CHKERRCUDA(cuer);
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1458 
1459 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1460 
/*
   Numeric phase for C = op(A)*op(B) with A SeqAIJCUSPARSE and B dense.

   Performs a cusparse SpMM with dimensions m,n,k chosen so that op(A) is m x k
   and the dense operand is k x n.  Because cusparse spmm cannot transpose the
   dense operand, ABt/RARt first form B^T explicitly (into mmdata->Bt) with a
   cublas geam.  For RARt/PtAP the SpMM result is written to the intermediate
   mmdata->X and the product is completed with a dense-dense multiply on the GPU.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,k,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  cusparseMatDescr_t           matA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cuer;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select the sparse operand, its cusparse operation, and the SpMM dimensions */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* PtAP first computes X = A*B here; the remaining product is formed below */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = A->cmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either apply CUSPARSE_OPERATION_TRANSPOSE to A, or use the explicitly
       generated transpose of A with no transposition */
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    k = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* RARt first computes X = A*B^T here; the remaining product is formed below */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = B->cmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matA   = mat->descr;
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* RARt/PtAP direct the SpMM output to the intermediate X; other products write C directly */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* cusparse spmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* out-of-place geam with alpha=1, beta=0 writes B^T into mmdata->Bt */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
  }

  /* perform the MatMat operation */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha,matA,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
  cuer = WaitForCUDA();CHKERRCUDA(cuer);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* RARt/PtAP: complete the product from B and X with a dense-dense multiply on the GPU */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* restore the original (CPU) types when the caller passed CPU matrices */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1588 
/* Symbolic phase for C = op(A)*op(B) with A SeqAIJCUSPARSE and B dense:
   sizes and types C, and allocates the MatMatCusparse auxiliary data used
   by the numeric phase */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mm;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* resulting dimensions of C for each supported product type */
  if      (product->type == MATPRODUCT_AB)   { m = A->rmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_AtB)  { m = A->cmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_ABt)  { m = A->rmap->n; n = B->rmap->n; }
  else if (product->type == MATPRODUCT_PtAP) { m = B->cmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_RARt) { m = B->rmap->n; n = B->rmap->n; }
  else SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* allocate the auxiliary product data */
  ierr = PetscNew(&mm);CHKERRQ(ierr);
  mm->cisdense = cisdense;
  /* cusparse spmm cannot transpose the dense operand: ABt/RARt need a device
     buffer to hold an explicit B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cuer;

    cuer = cudaMalloc((void**)&mm->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cuer);
  }
  /* RARt/PtAP need an intermediate dense matrix X for the first (sparse-dense) product */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mm->X);CHKERRQ(ierr);
    ierr = MatSetType(mm->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    /* do not preallocate X here: the first call to MatDenseCUDAGetArray will
       allocate on the GPU for us */
    if (product->type == MATPRODUCT_RARt) {
      ierr = MatSetSizes(mm->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mm->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mm;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
1662 
1663 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
1664 
/* MatProductSetFromOptions for SeqAIJCUSPARSE; handles dense B.  Falls back to
   the CPU SeqAIJ x SeqDense path when A is bound to the CPU; otherwise installs
   the CUDA symbolic routine for the supported product types. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  if (product->A->boundtocpu) {
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* unsupported product types are simply left without a symbolic routine */
  if (product->type == MATPRODUCT_AB   || product->type == MATPRODUCT_AtB  ||
      product->type == MATPRODUCT_ABt  || product->type == MATPRODUCT_PtAP ||
      product->type == MATPRODUCT_RARt) {
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
  }
  PetscFunctionReturn(0);
}
1690 
/* Computes yy = A*xx on the GPU; thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1699 
/* Computes zz = A*xx + yy on the GPU; thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1708 
/* Computes yy = A^H * xx (conjugate transpose) on the GPU; thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1717 
/* Computes zz = A^H * xx + yy on the GPU; thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1726 
/* Computes yy = A^T * xx on the GPU; thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1735 
/*
   Shared kernel implementing MatMult, MatMultAdd, MatMultTranspose,
   MatMultTransposeAdd and their Hermitian variants:

      zz = op(A)*xx            (yy == NULL)
      zz = op(A)*xx + yy       (yy != NULL; yy == zz is allowed)

   op(A) is A (trans=PETSC_FALSE), A^T (trans=PETSC_TRUE, herm=PETSC_FALSE) or
   A^H (trans=PETSC_TRUE, herm=PETSC_TRUE).  When the matrix uses compressed-row
   storage (all-zero rows dropped), the SpMV is staged through
   cusparsestruct->workVector and gathered/scattered with thrust via the
   compressed row indices.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* A has no nonzero rows (hence no entries), so op(A)*xx = 0:
       the result is 0 for MatMult and yy for MatMultAdd */
    if (!yy) {
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    } else if (zz != yy) {
      /* bug fix: zz was previously left untouched here, dropping the yy contribution */
      ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    }
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  } else {
    /* either apply the (conjugate) transpose operation to A, or use the explicitly
       generated transpose with no transposition */
    if (herm || !cusparsestruct->transgen) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      if (!matstruct) {
        ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
        matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      }
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv output if compressed */
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
    } else {
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; /* Use a work vector for SpMv input if compressed */
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;

      if (compressed) {
        /* gather the entries of xx at the nonzero rows of A into the work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
    }

    /* csr_spmv does y = alpha*Ax + beta*y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      if (cusparsestruct->nrows) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1859 
/* Computes zz = A^T * xx + yy on the GPU; thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1868 
/* Completes the assembly of a SeqAIJCUSPARSE matrix: finishes the host-side
   AIJ assembly, then mirrors the fully assembled matrix on the GPU */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* Skip the GPU copy when this is only a flush, the matrix is pinned to the
     CPU, or the matrix is a factor (factors manage their own GPU data) */
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1881 
1882 /* --------------------------------------------------------------------------------*/
1883 /*@
1884    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1886    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1887    assembly performance the user should preallocate the matrix storage by setting
1888    the parameter nz (or the array nnz).  By setting these parameters accurately,
1889    performance during matrix assembly can be increased by more than a factor of 50.
1890 
1891    Collective
1892 
1893    Input Parameters:
1894 +  comm - MPI communicator, set to PETSC_COMM_SELF
1895 .  m - number of rows
1896 .  n - number of columns
1897 .  nz - number of nonzeros per row (same for all rows)
1898 -  nnz - array containing the number of nonzeros in the various rows
1899          (possibly different for each row) or NULL
1900 
1901    Output Parameter:
1902 .  A - the matrix
1903 
1904    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1906    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1907 
1908    Notes:
1909    If nnz is given then nz is ignored
1910 
1911    The AIJ format (also called the Yale sparse matrix format or
1912    compressed row storage), is fully compatible with standard Fortran 77
1913    storage.  That is, the stored row and column indices can begin at
1914    either one (as in Fortran) or zero.  See the users' manual for details.
1915 
1916    Specify the preallocated storage with either nz or nnz (not both).
1917    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1918    allocation.  For large problems you MUST preallocate memory or you
1919    will get TERRIBLE performance, see the users' manual chapter on matrices.
1920 
1921    By default, this format uses inodes (identical nodes) when possible, to
1922    improve numerical efficiency of matrix-vector products and solves. We
1923    search for consecutive rows with the same nonzero structure, thereby
1924    reusing matrix information to achieve increased efficiency.
1925 
1926    Level: intermediate
1927 
1928 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1929 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Standard create/set-type/preallocate sequence; the CUSPARSE-specific
     setup is installed by MatSetType() via MatConvert_SeqAIJ_SeqAIJCUSPARSE() */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local sizes are also passed as the global sizes */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* cast drops const; the preallocation routine only reads nnz */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1941 
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Free the GPU-side data first: A->spptr holds either the SpMV structures
     (unfactored matrix) or the triangular-factor structures (factored one) */
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  /* Remove the composed functions installed when the matrix got this type */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  /* Hand the remaining (host) teardown to the SeqAIJ destroy routine */
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1959 
1960 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
1961 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Duplicate the host (SeqAIJ) representation, then convert the duplicate in
     place so it carries the CUSPARSE type and its GPU-side data structures */
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1971 
/* Switch the matrix operations between the host (SeqAIJ) kernels and the
   CUSPARSE (GPU) kernels; flg == PETSC_TRUE binds the matrix to the CPU */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factored matrices keep their GPU solve path; binding is a no-op */
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
           can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
  /* TODO: add support for this? */
  if (flg) {
    /* bind to CPU: point the mat-vec operations at the host (SeqAIJ) kernels;
       the Hermitian variants have no host implementation here, so clear them */
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  } else {
    /* unbind: restore the CUSPARSE (GPU) kernels and product hooks */
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}
2007 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE. With MAT_INPLACE_MATRIX the
   matrix A itself is retyped (*newmat aliases A); otherwise a new/reused
   target matrix receives a copy of the values first. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* CUDA vectors are the natural companion vector type for this matrix type */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  /* Allocate the GPU-side bookkeeping only once; a reused matrix keeps its
     existing spptr (and its existing cusparse handle) */
  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    /* nothing has been copied to the device yet */
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  /* Install the type-specific implementations of the Mat operations */
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* MatBindToCPU(...,PETSC_FALSE) installs the GPU mult/product kernels */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2053 
/* Type constructor registered for MATSEQAIJCUSPARSE: build a SeqAIJ matrix
   and convert it in place to the CUSPARSE type */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure a CUDA device has been initialized before touching cusparse */
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2064 
2065 /*MC
2066    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2067 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
2069    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2070    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2071 
2072    Options Database Keys:
2073 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2074 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2075 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2076 
2077   Level: beginner
2078 
2079 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2080 M*/
2081 
2082 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
2083 
2084 
/* Register the CUSPARSE solver package for all factorization kinds it
   supports on MATSEQAIJCUSPARSE matrices */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              i;

  PetscFunctionBegin;
  /* one factory routine serves every factorization type */
  for (i=0; i<sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2096 
/* Free the GPU-side SpMV data of an unfactored AIJCUSPARSE matrix: both mult
   structures, the scratch vector, the device row offsets, and the cusparse
   handle. Safe to call with *cusparsestruct == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    /* explicit comparison; the old `if (handle = ...)` assignment-in-condition
       is error-prone and triggers -Wparentheses */
    if ((*cusparsestruct)->handle) {
      stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2116 
/* Release a CsrMatrix and its device containers; *mat is nulled afterwards.
   A NULL *mat is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    /* free the three CSR component arrays before the struct itself */
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
2129 
/* Overload for triangular-factor structures: destroy the cusparse matrix
   descriptor, the solve-analysis info, and the CSR storage of one factor.
   Safe to call with *trifactor == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
2145 
/* Overload for SpMV structures: destroy the device matrix (HYB/ELL or CSR,
   depending on the storage format), the descriptor, the compressed-row index
   vector, and the device-resident scalar constants. Safe with *matstruct == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* the return code was previously ignored; check it like every other call site */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta constants live in device memory (used by cusparse SpMV calls) */
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
2173 
/* Release the contents of a triangular-factors container (all four factor
   structures, the permutation indices, and the scratch vector) while keeping
   the container itself alive so it can be refilled. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (fs) {
    /* lower/upper factors, plus their transposed counterparts */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);CHKERRQ(ierr);
    /* free each member and null it so the container is in a clean state */
    delete fs->rpermIndices;
    fs->rpermIndices = 0;
    delete fs->cpermIndices;
    fs->cpermIndices = 0;
    delete fs->workVector;
    fs->workVector = 0;
  }
  PetscFunctionReturn(0);
}
2193 
/* Fully destroy a triangular-factors container: reset its contents, destroy
   the cusparse handle, and free the container. Safe with *trifactors == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* explicit comparison; the old `if (handle = ...)` assignment-in-condition
       is error-prone and triggers -Wparentheses */
    if ((*trifactors)->handle) {
      stat = cusparseDestroy((*trifactors)->handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2210