xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 348a1646b1a7771dc468a628d08efb39f735eb8e)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
17 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 
19 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
20 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
21 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
22 
23 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
24 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
26 
27 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
31 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
32 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
35 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
36 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
37 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
38 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
39 
40 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
41 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
42 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
43 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
44 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
45 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
46 
/*
   MatCUSPARSESetStream - Associates a CUDA stream with the cuSPARSE handle stored
   on the matrix, so subsequent cuSPARSE operations on A are queued on that stream.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE (its spptr must already be allocated)
-  stream - the CUDA stream to use

   Level: developer
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* guard against being called before the GPU data structure exists (or on a non-cusparse Mat) */
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
57 
/*
   MatCUSPARSESetHandle - Replaces the cuSPARSE handle stored on the matrix.
   Any previously owned handle (different from the incoming one) is destroyed
   first; the pointer mode is then forced to DEVICE so scalar arguments of
   cuSPARSE calls are read from device memory.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE (its spptr must already be allocated)
-  handle - the cuSPARSE handle the matrix should use from now on

   Level: developer
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* guard against being called before the GPU data structure exists (or on a non-cusparse Mat) */
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* destroy the handle we currently own before adopting the caller's */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
73 
/*
   MatCUSPARSEClearHandle - Drops (without destroying) the cuSPARSE handle stored
   on the matrix, e.g. when ownership of the handle lies elsewhere.

   Input Parameter:
.  A - matrix of type SEQAIJCUSPARSE

   Level: developer
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* nothing to clear if the GPU data structure was never allocated */
  if (!cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
82 
/* Query callback composed on factor matrices as "MatFactorGetSolverType_C":
   reports MATSOLVERCUSPARSE as the package that produced the factor. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
89 
90 /*MC
91   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
92   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
93 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
94   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
95   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
96   algorithms are not recommended. This class does NOT support direct solver operations.
97 
98   Level: beginner
99 
100 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
101 M*/
102 
/*
   Creates an (empty) factor matrix of type SEQAIJCUSPARSE for A and installs the
   symbolic-factorization callbacks that match the requested factor type. The
   numeric callbacks are installed later by the symbolic routines themselves.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  Mat            fact;
  PetscInt       rows = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  fact = *B;
  fact->factortype = ftype;
  /* factors are square: use the local row count for both dimensions */
  ierr = MatSetSizes(fact,rows,rows,rows,rows);CHKERRQ(ierr);
  ierr = MatSetType(fact,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(fact,A,A);CHKERRQ(ierr);
    fact->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    fact->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    fact->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* the symbolic routines allocate the actual storage; skip preallocation here */
  ierr = MatSeqAIJSetPreallocation(fact,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)fact,"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
127 
/*
   Type-specific implementation behind MatCUSPARSESetFormat(): records the storage
   format used for SpMV. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both set the single
   format field of the sequential matrix; anything else is rejected.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT || op == MAT_CUSPARSE_ALL) {
    cusp->format = format;
  } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  PetscFunctionReturn(0);
}
145 
146 /*@
147    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
148    operation. Only the MatMult operation can use different GPU storage formats
149    for MPIAIJCUSPARSE matrices.
150    Not Collective
151 
152    Input Parameters:
153 +  A - Matrix of type SEQAIJCUSPARSE
154 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
155 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
156 
157    Output Parameter:
158 
159    Level: intermediate
160 
161 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
162 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: dispatches to the implementation composed as "MatCUSPARSESetFormat_C";
     silently a no-op for matrix types that do not provide one */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
172 
173 /*@
174    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
175 
176    Collective on mat
177 
178    Input Parameters:
179 +  A - Matrix of type SEQAIJCUSPARSE
180 -  transgen - the boolean flag
181 
182    Level: intermediate
183 
184 .seealso: MATSEQAIJCUSPARSE
185 @*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      iscusp;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&iscusp);CHKERRQ(ierr);
  if (iscusp) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    /* when disabling, drop any cached explicit transpose so a later re-enable
       cannot pick up a stale copy */
    if (!transgen) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
205 
/* Processes the -mat_cusparse_* options: explicit-transpose generation and the
   GPU storage format for SpMV (and, for factors, TriSolve as well). */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;
  Mat_SeqAIJCUSPARSE       *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* these two options only make sense for an unfactored matrix */
    PetscBool transgen = cusp->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&set);CHKERRQ(ierr);
    if (set) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,fmt);CHKERRQ(ierr);}
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
  if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);}
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
236 
/* ILU symbolic factorization: delegate the symbolic phase to the host SeqAIJ
   implementation, then install the CUSPARSE numeric stage on the factor. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
246 
/* LU symbolic factorization: delegate the symbolic phase to the host SeqAIJ
   implementation, then install the CUSPARSE numeric stage on the factor. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
256 
/* ICC symbolic factorization: delegate the symbolic phase to the host SeqAIJ
   implementation, then install the CUSPARSE numeric stage on the factor. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
266 
/* Cholesky symbolic factorization: delegate the symbolic phase to the host SeqAIJ
   implementation, then install the CUSPARSE numeric stage on the factor. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
276 
/*
   Builds the unit-diagonal lower-triangular factor of an ILU factorization on the
   GPU: assembles a host CSR copy of L (strictly-lower entries from the factored
   AIJ arrays plus explicit 1.0 diagonal entries), uploads it into a cuSPARSE CSR
   triangular matrix, and runs the cuSPARSE solve analysis. Only runs when the
   up-to-date data is on the CPU side (offloadmask UNALLOCATED or CPU).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  /* NOTE(review): initial value is never read; it is replaced by a freshly
     allocated struct below and stored back into loTriFactorPtr */
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* ai[n]-ai[1] counts the strictly-lower entries of rows 1..n-1; row 0
         contributes only its unit diagonal (presumably ai[1]==ai[0] in this
         layout — the loop below starts at i=1 on the same assumption) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* page-locked host buffers (cudaMallocHost) staged for upload */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      /* row 0 is just the unit diagonal entry */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* zero-based, triangular, lower fill, unit diagonal — matches the data built above */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* the assign() calls below copy the pinned host arrays to the device */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* the staging buffers are no longer needed once the device copies exist */
      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
381 
/*
   Builds the non-unit-diagonal upper-triangular factor of an ILU factorization on
   the GPU: assembles a host CSR copy of U from the factored AIJ arrays (the upper
   part is addressed through a->diag, walking rows from the bottom up), uploads it
   into a cuSPARSE CSR triangular matrix, and runs the cuSPARSE solve analysis.
   Only runs when the up-to-date data is on the CPU side.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  /* NOTE(review): initial value is never read; it is replaced by a freshly
     allocated struct below and stored back into upTriFactorPtr */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag decreases with the row index in this storage, hence adiag[0]-adiag[n] */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* page-locked host buffers (cudaMallocHost) staged for upload */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      /* walk rows bottom-up: row i's upper entries live between adiag[i+1]+1 and
         adiag[i] in the factored arrays, with the diagonal at aa[adiag[i]] (==v[nz]) */
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* store the reciprocal of the diagonal entry held in v[nz], since the
           descriptor below declares the diagonal NON_UNIT */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* zero-based, triangular, upper fill, non-unit diagonal — matches the data built above */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* the assign() calls below copy the pinned host arrays to the device */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* the staging buffers are no longer needed once the device copies exist */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
482 
/*
   Resets any existing triangular-factor data, rebuilds both ILU factors on the
   GPU, and uploads the row/column permutation indices when they are non-trivial.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow    = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* rebuild both triangular factors from scratch */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  factors->workVector = new THRUSTARRAY(n);
  factors->nnz        = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation indices, uploaded only when the ordering is non-trivial */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* column permutation indices, uploaded only when the ordering is non-trivial */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(c, c+n);
  }

  /* log however many permutation arrays were actually transferred */
  if (!row_identity || !col_identity) {
    PetscInt nperm = (!row_identity && !col_identity) ? 2*n : n;

    ierr = PetscLogCpuToGpu(nperm*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
530 
/*
   Builds both triangular factors of an ICC factorization on the GPU from a single
   upper-triangular host assembly. The upper factor U is uploaded as-is (unit
   diagonal, NON_TRANSPOSE solve); the "lower" factor reuses the same
   upper-triangular sparsity with differently scaled values and is solved with
   CUSPARSE_OPERATION_TRANSPOSE — which is why its fill mode is deliberately
   UPPER, not LOWER. Only runs when the up-to-date data is on the CPU side.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  /* NOTE(review): initial values of loTriFactor/upTriFactor are never read; both
     are replaced by freshly allocated structs below */
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* b aliases the same A->data reinterpreted through the SBAIJ layout used by the
     ICC factor storage */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* page-locked host buffers (cudaMallocHost) staged for upload */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the last entry of the row, used here as the (stored) diagonal;
           both factors keep its reciprocal at the diagonal position */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* off-diagonals: negate for U, and additionally scale by 1/v[nz] for the
             transposed (lower) solve values */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* zero-based, triangular, upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      /* the assign() calls below copy the pinned host arrays to the device */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* fill mode is UPPER on purpose: the lower solve is performed as a
         TRANSPOSE solve on this upper-triangular storage (see solveOp below) */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      /* same sparsity pattern as the upper factor, different values (AALo) */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      /* the staging buffers are no longer needed once the device copies exist */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
678 
/* Prepares an IC(0) factor for GPU triangular solves: rebuilds the cached
   triangular-factor structures on the device and, when the factorization used
   a non-identity ordering, mirrors the row permutation and its inverse onto
   the GPU so the MatSolve kernels can apply them with thrust. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowperm  = a->row;
  const PetscInt               *r;
  PetscBool                    identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* drop any previously cached GPU factor data, then rebuild the triangular matrices */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  factors->workVector = new THRUSTARRAY(n);
  /* total nnz over both factors: off-diagonals counted twice plus the diagonal once */
  factors->nnz = (a->nz-n)*2 + n;

  ierr = ISGetIndices(rowperm,&r);CHKERRQ(ierr);
  ierr = ISIdentity(rowperm,&identity);CHKERRQ(ierr);
  if (!identity) {
    IS             invperm;
    const PetscInt *ir;

    /* stage the permutation and its inverse on the device for the permuted solve path */
    ierr = ISInvertPermutation(rowperm,PETSC_DECIDE,&invperm);CHKERRQ(ierr);
    ierr = ISGetIndices(invperm,&ir);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(r, r+n);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(ir, ir+n);
    ierr = ISRestoreIndices(invperm,&ir);CHKERRQ(ierr);
    ierr = ISDestroy(&invperm);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(rowperm,&r);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
715 
/* Numeric LU factorization for SEQAIJCUSPARSE: the factorization itself runs
   on the CPU via the SeqAIJ kernel; afterwards the appropriate GPU solve
   routines are installed (fast natural-ordering variants when both the row
   and column permutations are identities) and the triangular factors are
   copied to the device. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* pick the natural-ordering fast path only when both permutations are identities */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no CUSPARSE MatMatSolve implementation in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* copy the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
745 
/* Numeric Cholesky/ICC factorization for SEQAIJCUSPARSE: delegates the
   factorization to the CPU SeqAIJ kernel, then installs the GPU solve
   routines (natural-ordering variants when the permutation is the identity)
   and stages the triangular factors on the device. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* identity ordering enables the fast natural-ordering solves */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no CUSPARSE MatMatSolve implementation in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* copy the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
774 
/* Builds explicit CSC (i.e. transposed) copies of the cached lower and upper
   triangular factors and runs the cusparse solve analysis on them, so that
   the MatSolveTranspose variants can use CUSPARSE_OPERATION_NON_TRANSPOSE
   solves. The results are cached in
   cusparseTriFactors->{lo,up}TriFactorPtrTranspose; the callers invoke this
   routine only when those pointers are still NULL.

   Fix over the previous version: loTriFactorT/upTriFactorT were initialized
   from the (NULL) cached transpose pointers and then immediately overwritten
   with `new` — dead stores that suggested a reuse that never happened. They
   are now plain uninitialized locals assigned exactly once. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor settings of the lower factor; the fill mode is
     flipped because transposition swaps lower <-> upper triangular */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* the transpose is stored explicitly, so it is solved non-transposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* cache the transposed lower factor */
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor settings of the upper factor, again flipping fill mode */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* the transpose is stored explicitly, so it is solved non-transposed */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* cache the transposed upper factor */
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
905 
/* Builds (lazily, at most once) the cached explicit transpose of the matrix,
   cusparsestruct->matTranspose, so that MatMultTranspose can run as a
   non-transposed SpMV. Handles both the CSR format and the ELL/HYB formats
   (the latter via HYB -> CSR -> CSC -> HYB round trips). No-op unless
   transpose generation was requested and the transpose is not yet cached. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  /* NOTE(review): this initializer is dead — matstructT is unconditionally
     overwritten with `new` below; the guard reads cusparsestruct->matTranspose
     directly */
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* nothing to do if transpose generation is disabled or already done */
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix for the transpose, mirroring the index base of the original */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalar constants (1, 0, 1) used by the
     SpMV calls; cusparse is put into device pointer mode to match */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* the transpose swaps row and column dimensions */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* NOTE(review): rowoffsets_gpu is allocated without checking for a prior
       allocation; if the transpose cache can be regenerated this leaks — verify */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    CsrMatrix *temp = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;

    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    /* NOTE(review): tempT holds the TRANSPOSE, yet its dims are not swapped
       (num_rows = A->rmap->n) unlike matrixT in the CSR branch above — this
       looks correct only for square matrices; confirm for rectangular A */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  err  = WaitForGPU();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* cache the transpose so the guard above short-circuits next time */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1033 
/* MatSolveTranspose_SeqAIJCUSPARSE - GPU triangular solve of A^T x = b for a
   permuted (non-natural-ordering) factor.

   Instead of passing CUSPARSE_OPERATION_TRANSPOSE to the solves, explicit CSC
   (transposed) copies of both factors are built once by
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve and then solved non-transposed.
   (Original author's open question: why do we need to analyze the transposed
   matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in
   MatSolve_SeqAIJCUSPARSE?)

   This routine is installed only for non-identity orderings (see the factor
   numeric routines), so rpermIndices/cpermIndices are assumed present on the
   device. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factors lazily on first use */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Step 1: reorder b with the row permutation into x (the solve scratch) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Step 2: solve with the transposed upper factor (transpose solve order is U then L) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Step 3: solve with the transposed lower factor, result lands in x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Step 4: copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Step 5: copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector arrays */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1104 
/* Solves A^T x = b on the GPU for a factor stored in the natural ordering
   (no row/column permutations to apply). Uses the cached explicitly
   transposed triangular factors, building them lazily on first use, and
   solves with the upper transpose first, then the lower transpose. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* build the transposed factors on the fly the first time they are needed */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* fetch the raw device pointers of the vectors */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* first triangular sweep: transposed upper factor, b -> work */
  stat = cusparse_solve(fs->handle, upT->solveOp,
                        upT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        b, work->data().get());CHKERRCUSPARSE(stat);

  /* second triangular sweep: transposed lower factor, work -> x */
  stat = cusparse_solve(fs->handle, loT->solveOp,
                        loT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), x);CHKERRCUSPARSE(stat);

  /* hand the vectors back and log the work */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1156 
/* MatSolve_SeqAIJCUSPARSE - GPU triangular solve of A x = b for a permuted
   (non-natural-ordering) factor: permute b by the row permutation, solve L
   then U, then apply the column permutation. Installed only for non-identity
   orderings (see the factor numeric routines), so rpermIndices/cpermIndices
   are assumed present on the device. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Step 1: reorder b with the row permutation into the work vector.
     The copy's extent comes from the index iterators (begin/end of
     rpermIndices); the element iterator in the "last" argument is not used
     for the distance, so plain bGPU is fine here (elsewhere in this file
     bGPU+n is used — both are equivalent). */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Step 2: solve L, work -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Step 3: solve U, x -> work */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Step 4: reorder the work vector with the column permutation into x */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  /* restore the vector arrays and log the work */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1215 
/* Solves A x = b on the GPU for a factor stored in the natural ordering:
   no permutations are needed, so this is just a forward sweep with L
   followed by a backward sweep with U through the cached factors. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* fetch the raw device pointers of the vectors */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* forward sweep with the lower factor: b -> work */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        b, work->data().get());CHKERRCUSPARSE(stat);

  /* backward sweep with the upper factor: work -> x */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows, &PETSC_CUSPARSE_ONE, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), x);CHKERRCUSPARSE(stat);

  /* hand the vectors back and log the work */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1259 
/* Mirror the host (Mat_SeqAIJ) matrix onto the GPU when the device copy is absent
   or stale. Two paths:
     - nonzero pattern unchanged and CSR format: copy only the numerical values
       (and refresh the cached transpose, if one was built);
     - otherwise: destroy and fully rebuild the device structures (CSR or HYB/ELL),
       including the compressed-row index scatter when zero rows are dropped.
   Fix over the previous revision: workVector and rowoffsets_gpu are reset to NULL
   immediately after being deleted. Before, rowoffsets_gpu was never reset here, so
   a second rebuild (with no intervening transpose generation to reallocate it)
   deleted a dangling pointer — a double free. Likewise workVector would dangle if
   the try block threw before reassigning it. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Same nonzero pattern as the current device copy: copy values only */
      CsrMatrix *mat,*matT;
      mat  = (CsrMatrix*)cusparsestruct->mat->mat;

      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      mat->values->assign(a->a, a->a+a->nz);
      err  = WaitForGPU();CHKERRCUDA(err);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT when it was built before; csr2csc renumbers the new values into the transpose */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                 A->cmap->n, mat->num_entries,
                                 mat->values->data().get(),
                                 cusparsestruct->rowoffsets_gpu->data().get(),
                                 mat->column_indices->data().get(),
                                 matT->values->data().get(),
                                 matT->column_indices->data().get(),
                                 matT->row_offsets->data().get(),
                                 CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
        err  = WaitForGPU();CHKERRCUDA(err);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      /* Pattern changed (or non-CSR format): rebuild the device structures from scratch */
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      cusparsestruct->workVector = NULL;     /* reset so an exception below cannot leave a dangling pointer */
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->rowoffsets_gpu = NULL; /* reset: otherwise the next rebuild double-deletes this pointer */
      try {
        if (a->compressedrow.use) {
          /* compressed-row storage drops all-zero rows; ridx maps compressed rows to full rows */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        cusparsestruct->nrows = m;

        /* create cusparse matrix */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident 1/0 scalars, used with CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* build a temporary CSR image, convert it to HYB/ELL, then discard the CSR copy */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForGPU();CHKERRCUDA(err);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1418 
/* Thrust functor for zipped iterators: adds element 0 of the tuple into element 1
   (i.e. y += x). Used to scatter-add the compressed-row work vector into the
   full result vector in MatMultAddKernel_SeqAIJCUSPARSE. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};
1428 
/* Thrust functor for zipped iterators: copies element 1 of the tuple into
   element 0 (the reverse direction of a plain assign). Used to gather entries
   of the input vector into the compressed-row work vector before a transpose
   SpMV in MatMultAddKernel_SeqAIJCUSPARSE. */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
1438 
/* Product context stored in C->product->data for sparse(CUSPARSE) x dense(CUDA)
   matrix-matrix products; freed by MatDestroy_MatMatCusparse(). */
typedef struct {
  PetscBool   cisdense; /* user's C was MATSEQDENSE (CPU): convert the result back after the GPU compute */
  PetscScalar *Bt;      /* device buffer for the explicit transpose of B; allocated only for ABt/RARt */
  Mat         X;        /* intermediate dense product (A*B^T for RARt, A*B for PtAP); NULL otherwise */
} MatMatCusparse;
1444 
/* Destructor for the MatMatCusparse product context: releases the device
   transpose buffer, the intermediate dense matrix, and the context itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse*)data;
  cudaError_t    ccer;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ccer = cudaFree(mm->Bt);CHKERRCUDA(ccer); /* cudaFree(NULL) is a no-op */
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1457 
1458 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1459 
/* Numeric phase for C = op(A)*op(B) products with A sparse (SEQAIJCUSPARSE) and B
   dense: computes on the GPU with cusparse_csr_spmm.
   - AtB uses the cached device transpose of A when available (transgen), avoiding
     CUSPARSE_OPERATION_TRANSPOSE in the spmm call.
   - ABt/RARt: cusparse spmm cannot transpose B, so B is explicitly transposed into
     mmdata->Bt with cublasXgeam first.
   - PtAP/RARt: the sparse-dense product lands in the intermediate mmdata->X and is
     then combined with B via a dense-dense product.
   A CPU dense B (or a CPU dense C requested by the user) is converted in place. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,k,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  cusparseMatDescr_t           matA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cuer;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick the device matrix, the cusparse op, and the spmm dimensions (m x k times k x n) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = A->cmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the explicitly stored transpose with a non-transpose op */
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    k = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = B->cmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matA   = mat->descr;
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write into the intermediate X; the other products write C directly */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* cusparse spmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* out-of-place transpose: Bt = 1*B^T + 0*B^T, stored with leading dimension B->cmap->n */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
  }

  /* perform the MatMat operation */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha,matA,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
  cuer = WaitForGPU();CHKERRCUDA(cuer);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    /* C = B * X = R * (A*R^T) */
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    /* C = B^T * X = P^T * (A*P); first PETSC_TRUE requests the transpose of B */
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* restore the CPU types the caller handed us */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1587 
/* Symbolic phase for sparse(CUSPARSE) x dense products: sizes C, forces it to
   MATSEQDENSECUDA (remembering whether the user asked for a CPU dense result),
   and allocates the per-product scratch (Bt buffer for ABt/RARt, intermediate X
   for PtAP/RARt) in the MatMatCusparse context. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions for each supported product */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
  /* cusparse spmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr;

    cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
1661 
1662 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
1663 
1664 /* handles dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  /* A bound to the CPU: dispatch to the host AIJ*Dense implementation instead */
  if (product->A->boundtocpu) {
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
    break; /* was an implicit fall-through into default; explicit break avoids -Wimplicit-fallthrough and future breakage */
  default:
    break;
  }
  PetscFunctionReturn(0);
}
1689 
/* yy = A*xx  (shared kernel with trans=FALSE, herm=FALSE and no additive vector) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1698 
/* zz = A*xx + yy  (shared kernel with trans=FALSE, herm=FALSE) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1707 
/* yy = A^H*xx  (shared kernel with trans=TRUE, herm=TRUE and no additive vector) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1716 
/* zz = A^H*xx + yy  (shared kernel with trans=TRUE, herm=TRUE) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1725 
/* yy = A^T*xx  (shared kernel with trans=TRUE, herm=FALSE and no additive vector) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1734 
/* Shared kernel behind MatMult/MatMultAdd/MatMultTranspose/MatMultHermitianTranspose:
     zz = op(A)*xx          (yy == NULL)
     zz = op(A)*xx + yy     (yy != NULL)
   where op is selected by trans/herm. When A uses compressed-row storage (all-zero
   rows dropped), the SpMV runs on the compressed matrix and a work vector is used:
   as the SpMV output to be scatter-added into zz (non-transpose case), or as the
   SpMV input gathered from xx (transpose case), via matstruct->cprowIndices. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  /* empty matrix: result is yy (zz already equals it when yy==zz) or zero */
  if (!a->nonzerorowcnt) {
    if (!yy) {
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  } else {
    /* transpose: either ask cusparse to transpose on the fly, or (when transgen
       and not Hermitian) use the explicitly stored transpose, building it lazily */
    if (herm || !cusparsestruct->transgen) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      if (!matstruct) {
        ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
        matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      }
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv output if compressed */
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
    } else {
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; /* Use a work vector for SpMv input if compressed */
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;

      if (compressed) {
        /* gather xx[cprowIndices] into the work vector so it matches the compressed matrix's columns */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
    }

    /* csr_spmv does y = alpha*Ax + beta*y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      if (cusparsestruct->nrows) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    cerr = WaitForGPU();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForGPU();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose case: spmv accumulated with beta_one when yy aliases zz; only a distinct yy needs adding */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1858 
/* zz = A^T*xx + yy  (shared kernel with trans=TRUE, herm=FALSE) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1867 
/* Finish assembly on the host, then mirror the data to the GPU — but only on a
   final assembly of an unfactored matrix that is not bound to the CPU. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1880 
1881 /* --------------------------------------------------------------------------------*/
1882 /*@
1883    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1885    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1886    assembly performance the user should preallocate the matrix storage by setting
1887    the parameter nz (or the array nnz).  By setting these parameters accurately,
1888    performance during matrix assembly can be increased by more than a factor of 50.
1889 
1890    Collective
1891 
1892    Input Parameters:
1893 +  comm - MPI communicator, set to PETSC_COMM_SELF
1894 .  m - number of rows
1895 .  n - number of columns
1896 .  nz - number of nonzeros per row (same for all rows)
1897 -  nnz - array containing the number of nonzeros in the various rows
1898          (possibly different for each row) or NULL
1899 
1900    Output Parameter:
1901 .  A - the matrix
1902 
1903    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1905    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1906 
1907    Notes:
1908    If nnz is given then nz is ignored
1909 
1910    The AIJ format (also called the Yale sparse matrix format or
1911    compressed row storage), is fully compatible with standard Fortran 77
1912    storage.  That is, the stored row and column indices can begin at
1913    either one (as in Fortran) or zero.  See the users' manual for details.
1914 
1915    Specify the preallocated storage with either nz or nnz (not both).
1916    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1917    allocation.  For large problems you MUST preallocate memory or you
1918    will get TERRIBLE performance, see the users' manual chapter on matrices.
1919 
1920    By default, this format uses inodes (identical nodes) when possible, to
1921    improve numerical efficiency of matrix-vector products and solves. We
1922    search for consecutive rows with the same nonzero structure, thereby
1923    reusing matrix information to achieve increased efficiency.
1924 
1925    Level: intermediate
1926 
1927 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1928 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;
  Mat            B;

  PetscFunctionBegin;
  /* Create an empty matrix, give it the CUSPARSE type, then preallocate:
     nnz (per-row counts) takes precedence over the uniform count nz */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  B    = *A;
  ierr = MatSetSizes(B,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1940 
/* Tear down the GPU-side data, remove composed methods, then run the host destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* spptr holds a Mat_SeqAIJCUSPARSETriFactors for factored matrices,
     a Mat_SeqAIJCUSPARSE otherwise */
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  /* detach the methods composed in MatConvert_SeqAIJ_SeqAIJCUSPARSE/MatBindToCPU */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1958 
1959 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
1960 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by first cloning the host AIJ data, then converting the copy
   in place so it picks up the CUSPARSE type and its GPU-side structures. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1970 
/* Switch the matrix operations table between the host (SeqAIJ) and device
   (SeqAIJCUSPARSE) implementations. flg == PETSC_TRUE binds the matrix to the
   CPU; PETSC_FALSE restores the GPU paths. Factored matrices are left alone. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
           can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
  /* TODO: add support for this? */
  if (flg) {
    /* Bound to CPU: install the host kernels and detach the GPU product methods.
       Hermitian-transpose ops have no plain SeqAIJ counterpart here, so they are cleared. */
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  } else {
    /* Bound to GPU: install the CUSPARSE kernels and (re)compose the product methods */
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}
2006 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE.
   reuse semantics: MAT_INITIAL_MATRIX duplicates A into *newmat, MAT_REUSE_MATRIX
   copies A's values into the existing *newmat, and MAT_INPLACE_MATRIX (the
   remaining case) converts A itself (*newmat must alias A). */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* vectors associated with this matrix should live on the GPU as well */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  /* allocate the GPU-side structure the first time through; MAT_REUSE_MATRIX
     targets already have one from a previous conversion */
  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    } else {
      /* factored matrices carry triangular-factor data instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  /* install the CUSPARSE-specific operations */
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* start unbound from the CPU, i.e. with the GPU mult/solve paths active */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2052 
/* Type constructor registered for MATSEQAIJCUSPARSE: build the host SeqAIJ
   matrix and convert it in place to the CUSPARSE variant. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2062 
2063 /*MC
2064    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2065 
2066    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
2067    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2068    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2069 
2070    Options Database Keys:
2071 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2072 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2073 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2074 
2075   Level: beginner
2076 
2077 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2078 M*/
2079 
2080 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
2081 
2082 
/* Register the CUSPARSE solver package for every factorization it supports
   (LU, Cholesky, ILU, ICC) on MATSEQAIJCUSPARSE matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  PetscInt            i;

  PetscFunctionBegin;
  for (i=0; i<(PetscInt)(sizeof(ftypes)/sizeof(ftypes[0])); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2094 
/* Free the Mat_SeqAIJCUSPARSE structure: the GPU mult structs (plain and
   transpose), the scratch vectors, and the cuSPARSE handle. *cusparsestruct
   may be NULL (no-op) and is zeroed by the final PetscFree. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;     /* delete of NULL is a no-op */
    delete (*cusparsestruct)->rowoffsets_gpu;
    /* was: if (handle = (*cusparsestruct)->handle) — assignment-as-condition;
       test the member directly and drop the redundant local */
    if ((*cusparsestruct)->handle) {
      stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2114 
/* Release a CsrMatrix and its three device arrays, then NULL the caller's
   pointer. Safe to call with *mat == NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (!*mat) PetscFunctionReturn(0);
  delete (*mat)->row_offsets;
  delete (*mat)->column_indices;
  delete (*mat)->values;
  delete *mat;
  *mat = 0;
  PetscFunctionReturn(0);
}
2127 
/* Release one triangular-factor struct: its cuSPARSE matrix descriptor, the
   solve-analysis info, and the CSR storage. NULLs the caller's pointer;
   a NULL *trifactor is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (!*trifactor) PetscFunctionReturn(0);
  if ((*trifactor)->descr)     { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
  if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
  ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
  delete *trifactor;
  *trifactor = 0;
  PetscFunctionReturn(0);
}
2143 
/* Release a mult struct in the given storage format: the wrapped HYB/ELL or
   CSR matrix, the descriptor, the compressed-row index vector, and the three
   device-resident scalar constants. NULLs the caller's pointer; a NULL
   *matstruct is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* was: CsrMatrix_Destroy(&mat); — the PetscErrorCode return was
           silently dropped, unlike every other call in this file */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
2171 
/* Reset (but do not free) a trifactors container: destroy the four triangular
   factors, the permutation index vectors, and the work vector, leaving the
   container itself and its cuSPARSE handle intact for reuse. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (fs) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = 0;
    fs->cpermIndices = 0;
    fs->workVector   = 0;
  }
  PetscFunctionReturn(0);
}
2191 
/* Fully destroy a trifactors container: reset its contents, destroy the
   cuSPARSE handle, and free (and NULL) the container itself. A NULL
   *trifactors is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* was: if (handle = (*trifactors)->handle) — assignment-as-condition;
       test the member directly and drop the redundant local */
    if ((*trifactors)->handle) {
      stat = cusparseDestroy((*trifactors)->handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2208