xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 55e7fe800d976e85ed2b5cd8bfdef564daa37bd9)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
20 
21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
24 
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 
35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
36 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
37 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
38 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
39 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
/*
   MatCUSPARSESetStream - caches a CUDA stream on the matrix and attaches it to
   the matrix's cuSPARSE handle, so subsequent cuSPARSE calls made through this
   matrix are queued on that stream.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE
-  stream - the CUDA stream to use
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  cusp->stream = stream;
  stat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
51 
/*
   MatCUSPARSESetHandle - installs a caller-provided cuSPARSE handle on the
   matrix, destroying any different handle the matrix previously held, and puts
   the handle into device pointer mode.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE
-  handle - the cuSPARSE handle to adopt
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (cusp->handle != handle) {
    /* release the handle we currently hold before adopting the new one */
    if (cusp->handle) {
      stat = cusparseDestroy(cusp->handle);CHKERRCUDA(stat);
    }
    cusp->handle = handle;
  }
  /* scalar arguments to cuSPARSE calls are passed as device pointers */
  stat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
67 
/*
   MatCUSPARSEClearHandle - drops the matrix's reference to its cuSPARSE handle
   without destroying it (useful when the handle is owned elsewhere).
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
76 
/* Query callback composed on factor matrices: reports the solver package name
   (MATSOLVERCUSPARSE) through *type. The matrix argument is unused. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
83 
84 /*MC
85   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
86   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
87   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
88   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
89   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
90   algorithms are not recommended. This class does NOT support direct solver operations.
91 
92   Level: beginner
93 
94 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
95 M*/
96 
/*
   MatGetFactor_seqaijcusparse_cusparse - creates an (empty) factor matrix of
   type SEQAIJCUSPARSE for A and wires in the symbolic factorization routines
   appropriate to the requested factor type.

   Input Parameters:
+  A     - the matrix to be factored (square; sizes taken from A->rmap->n)
-  ftype - one of MAT_FACTOR_LU, MAT_FACTOR_ILU, MAT_FACTOR_ILUDT,
           MAT_FACTOR_CHOLESKY, MAT_FACTOR_ICC

   Output Parameter:
.  B - the factor matrix (preallocation is skipped; the symbolic stage fills it)
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* inherit block sizes from A for every supported factor type; previously this
     was done only on the LU/ILU branch, leaving Cholesky/ICC factors without
     the block-size information */
  ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* the symbolic factorization allocates the structure, so skip preallocation here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
121 
/* Type-specific implementation behind MatCUSPARSESetFormat(): records the
   requested GPU storage format on the Mat_SeqAIJCUSPARSE structure. For a
   sequential matrix MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are equivalent,
   since a single format field covers all operations. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: both ops set the single format field */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* ELL and HYB formats need CUDA 4.2+; CSR is the only choice on older toolkits */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}
143 
144 /*@
145    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
146    operation. Only the MatMult operation can use different GPU storage formats
147    for MPIAIJCUSPARSE matrices.
148    Not Collective
149 
150    Input Parameters:
151 +  A - Matrix of type SEQAIJCUSPARSE
152 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
153 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
154 
155    Output Parameter:
156 
157    Level: intermediate
158 
159 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
160 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed on A;
     silently a no-op for matrix types that do not provide it */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
170 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - processes the runtime options that select
   the GPU storage format: -mat_cusparse_mult_storage_format (SpMV only, offered
   only for unfactored matrices) and -mat_cusparse_storage_format (all ops).
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  /* NOTE(review): for factored matrices (factortype != MAT_FACTOR_NONE) A->spptr
     holds a Mat_SeqAIJCUSPARSETriFactors, not a Mat_SeqAIJCUSPARSE, yet
     cusparsestruct->format is read unconditionally below — confirm the layouts
     make this read safe */
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* SpMV-only format; current format is offered as the default value */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  /* format for all operations (SpMV and triangular solves) */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
196 
/* Symbolic ILU: delegate to the CPU AIJ implementation, then redirect the
   numeric stage to the CUSPARSE version so the factors end up on the GPU. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
206 
/* Symbolic LU: delegate to the CPU AIJ implementation, then redirect the
   numeric stage to the CUSPARSE version so the factors end up on the GPU. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
216 
/* Symbolic ICC: delegate to the CPU AIJ implementation, then redirect the
   numeric stage to the CUSPARSE Cholesky version. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
226 
/* Symbolic Cholesky: delegate to the CPU AIJ implementation, then redirect the
   numeric stage to the CUSPARSE version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
236 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - assembles the unit lower triangular
   factor L as a CSR matrix on the GPU and runs the cuSPARSE triangular-solve
   analysis on it.

   For each row i >= 1 the code copies ai[i+1]-ai[i] entries from a->j/a->a and
   then appends an explicit unit diagonal; row 0 contributes only its unit
   diagonal. The copy source pointer starts at aa/aj, so this assumes row 0 has
   no strictly-lower entries (i.e. ai[0] == ai[1] == 0) — TODO confirm against
   the factored Mat_SeqAIJ storage produced by MatLUFactorNumeric_SeqAIJ.

   Only runs when the GPU copy is missing or stale (valid_GPU_matrix is
   PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;    /* host-side CSR row offsets / column indices for L */
  PetscScalar                       *AALo;           /* host-side CSR values for L */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1 */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host memory: these staging buffers are copied to the GPU below */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;  /* row 0 holds only the unit diagonal */
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        /* append the explicit unit diagonal at the end of the row */
        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      /* UNIT diag: cuSPARSE will treat the diagonal as ones during the solve */
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* device-side copies of the host staging buffers */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      /* NOTE(review): only catches thrown char* — thrust typically throws
         std::bad_alloc/thrust::system_error, which would propagate past here */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
338 
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - assembles the upper triangular
   factor U as a CSR matrix on the GPU and runs the cuSPARSE triangular-solve
   analysis on it.

   Rows are walked backwards (i = n-1 .. 0), reading row i's off-diagonal
   entries from aj/aa between adiag[i+1]+1 and adiag[i]-1 and its diagonal at
   adiag[i]. The diagonal is stored on the GPU as 1./v[nz]; this assumes the
   factored AIJ format keeps the U diagonal inverted at adiag[i] — TODO confirm
   against MatLUFactorNumeric_SeqAIJ.

   Only runs when the GPU copy is missing or stale (valid_GPU_matrix is
   PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;    /* host-side CSR row offsets / column indices for U */
  PetscScalar                       *AAUp;           /* host-side CSR values for U */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag runs backwards, so adiag[0]-adiag[n] counts all U entries (diagonal included) */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned host staging buffers, copied to the GPU below */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      /* fill rows from the bottom up, decrementing offset as we go */
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];  /* v[nz] is the entry stored at adiag[i] */
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      /* NON_UNIT: the (explicitly stored) diagonal participates in the solve */
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* device-side copies of the host staging buffers */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
    } catch(char *ex) {
      /* NOTE(review): only catches thrown char* — see the lower-tri builder */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
436 
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - builds the L and U CSR factors on
   the GPU, allocates the solve work vector, and caches the row/column
   permutation indices on the device when the orderings are not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *ridx,*cidx;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* upload both triangular factors and run the cusparse solve analysis on each */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  factors->workVector = new THRUSTARRAY(n);
  factors->nnz        = a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;

  /* record the row permutation on the device unless it is the identity */
  ierr = ISGetIndices(isrow,&ridx);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(ridx, ridx+n);
  }
  ierr = ISRestoreIndices(isrow,&ridx);CHKERRQ(ierr);

  /* record the column permutation on the device unless it is the identity */
  ierr = ISGetIndices(iscol,&cidx);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(cidx, cidx+n);
  }
  ierr = ISRestoreIndices(iscol,&cidx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
474 
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - assembles both triangular factors of
   an incomplete Cholesky factorization on the GPU from a single host pass.

   Only the upper triangle is stored on the host; one CSR sparsity pattern
   (AiUp/AjUp) is built and shared by both GPU factors:
     - upTriFactor: values AAUp, solved NON_TRANSPOSE with UNIT diagonal
       (the stored diagonal entry 1/d is present but ignored by cuSPARSE);
     - loTriFactor: values AALo, fill mode UPPER but solved TRANSPOSE — i.e.
       the lower solve is done as U^T, reusing the upper-triangular pattern.

   NOTE(review): A->data is cast to both Mat_SeqAIJ (for a->nz) and
   Mat_SeqSBAIJ (for b->i/b->j/b->a) — this relies on the ICC factor actually
   carrying SBAIJ-layout arrays at those fields; confirm against
   MatICCFactorSymbolic_SeqAIJ. The sign/scaling of AAUp/AALo follows the host
   factored storage (v[nz] at the row end appears to be the diagonal pivot) —
   TODO confirm.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;    /* shared CSR pattern for both factors */
  PetscScalar                       *AAUp;           /* values of the upper factor */
  PetscScalar                       *AALo;           /* values of the (transposed) lower factor */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* pinned host staging buffers; AAUp and AALo share the AiUp/AjUp pattern */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];  /* v[nz] is the last stored entry of row i */
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* negate the off-diagonals for U; scale them once more by 1/v[nz] for L^T */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      /* UNIT diag: the stored 1/d diagonal entries are skipped during this solve */
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      /* stored UPPER but solved with OPERATION_TRANSPOSE below: the lower solve
         is performed as a transposed upper solve on the shared pattern */
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      /* NOTE(review): only catches thrown char* — see the ILU builders */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
619 
/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - builds the ICC triangular factors
   on the GPU, allocates the solve work vector, and caches the (single,
   symmetric) permutation on the device when it is not the identity. The nnz
   count reflects both triangles: off-diagonals twice plus n diagonal entries.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *pidx;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  factors->workVector = new THRUSTARRAY(n);
  factors->nnz        = (a->nz-n)*2 + n;

  /* the same permutation serves as both row and column permutation */
  ierr = ISGetIndices(ip,&pidx);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(pidx, pidx+n);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(pidx, pidx+n);
  }
  ierr = ISRestoreIndices(ip,&pidx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
647 
/*
   MatLUFactorNumeric_SeqAIJCUSPARSE - runs the CPU numeric LU factorization,
   selects the solve kernels (the natural-ordering variants skip permutation
   when both orderings are the identity), then copies the factors to the GPU
   and performs the cusparse solve analysis.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization itself happens on the CPU */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* upload the triangular factors and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
672 
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  /* Numeric Cholesky factorization: run the host AIJ kernel, then select the
     CUSPARSE solve variants and move the triangular factors onto the GPU. */
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* natural ordering admits the faster permutation-free solve kernels */
  ierr = ISIdentity(fact->row,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* push the triangular factors to the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
697 
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  /*
    Builds the transposes (CSC representations) of the lower and upper
    triangular factors and runs the cusparse solve analysis on them, caching
    the results in loTriFactorPtrTranspose/upTriFactorPtrTranspose so that
    MatSolveTranspose can run entirely on the GPU.

    Fix: loTriFactorT/upTriFactorT were previously initialized from the cached
    transpose pointers and then immediately overwritten with `new`; the dead
    initializers are removed so the intent (always build fresh) is explicit.
  */
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; /* built below; never read before assignment */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; /* built below; never read before assignment */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor; the fill mode
     is flipped because the transpose of a lower triangular matrix is upper
     triangular (and vice versa) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* since the transpose is stored explicitly, the solve uses NON_TRANSPOSE */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* cache the transpose of the lower factor (cusparseTriFactors == A->spptr) */
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor; the fill mode
     is flipped for the same reason as above */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* since the transpose is stored explicitly, the solve uses NON_TRANSPOSE */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* cache the transpose of the upper factor */
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
828 
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  /*
    Builds an explicit transpose of the matrix on the GPU and caches it in
    cusparsestruct->matTranspose for MatMultTranspose.  CSR format uses a
    single csr2csc; ELL/HYB goes through hyb2csr -> csr2csc -> csr2hyb.

    Fixes:
    - matstructT was initialized from the cached transpose pointer and then
      immediately overwritten with `new`; the dead initializer is removed.
    - For rectangular matrices the transpose has A->cmap->n rows, so its
      row_offsets array must have A->cmap->n+1 entries (csr2csc writes
      num_cols(input)+1 offsets); previously A->rmap->n+1 was allocated.
      The HYB temporary tempT and the csr2hyb dimensions are fixed likewise.
  */
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT; /* built below; never read before assignment */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the matrix-transpose information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta (device-resident scalars for the cusparse spmv calls) */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(A->cmap->n+1); /* the transpose has A->cmap->n rows */
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose, i.e. the CSC representation of A */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose); tempT holds the
       transpose, which has A->cmap->n rows and A->rmap->n columns */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->cmap->n;
    tempT->num_cols = A->rmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->cmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB (dimensions are those of the transpose) */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices (one per row of the transpose) */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());

  /* cache the transpose (cusparsestruct == A->spptr) */
  cusparsestruct->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
959 
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  /* Solve A^T x = b on the GPU using the cached transposed (CSC) triangular
     factors, applying the stored row permutation to b on entry and the column
     permutation to x on exit.  rpermIndices/cpermIndices must be populated
     (this code path is installed only when the ordering is not the identity).
     Since A = LU implies A^T = U^T L^T, U^T is solved before L^T. */
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly: the
     transposed factors are built lazily on the first transpose solve */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation: x <- b(rperm).
     NOTE(review): the end iterator's base is bGPU+n while the begin's is
     bGPU; only the index iterators delimit the range here, but the bases
     should probably match as in MatSolve_SeqAIJCUSPARSE -- confirm. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transposed U factor: tempGPU <- (U^T)^{-1} x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve with the transposed L factor: x <- (L^T)^{-1} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector arrays and wait for the device before logging */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);

  /* two flops per factor nonzero, minus the n divisions counted once */
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1027 
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  /* Solve A^T x = b on the GPU for a factorization computed in the natural
     (identity) ordering, so no row/column permutations are applied.  The
     transposed factors are built lazily on first use; U^T is solved first,
     then L^T (A = LU implies A^T = U^T L^T). */
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* build the transposed factors on the fly the first time they are needed */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* get the device pointers of the input and output vectors */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  /* solve with the transposed U factor: work <- (U^T)^{-1} b */
  stat = cusparse_solve(fs->handle, upT->solveOp,
                        upT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        b, work->data().get());CHKERRCUDA(stat);

  /* solve with the transposed L factor: x <- (L^T)^{-1} work */
  stat = cusparse_solve(fs->handle, loT->solveOp,
                        loT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), x);CHKERRCUDA(stat);

  /* restore the arrays, synchronize, and log the flops */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1076 
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  /* Solve A x = b on the GPU with the cached triangular factors, applying the
     stored row permutation to b on entry and the column permutation to x on
     exit.  rpermIndices/cpermIndices must be populated (this code path is
     installed only when the ordering is not the identity). */
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation: x <- b(rperm).  The range is
     delimited by the permutation-index iterators, with bGPU as the common
     base of both permutation iterators. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L: tempGPU <- L^{-1} x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U: x <- U^{-1} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector arrays, synchronize, and log the flops */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1135 
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  /* Solve A x = b on the GPU for a factorization computed in the natural
     (identity) ordering: no permutation of b or x is required, so the
     triangular solves act directly on the vector arrays. */
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* get the device pointers of the input and output vectors */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  /* lower triangular solve: work <- L^{-1} b */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        b, work->data().get());CHKERRCUDA(stat);

  /* upper triangular solve: x <- U^{-1} work */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows, &PETSC_CUSPARSE_ONE, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), x);CHKERRCUDA(stat);

  /* restore the arrays, synchronize, and log the flops */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1176 
1177 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1178 {
1179 
1180   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1181   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1182   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1183   PetscInt                     m = A->rmap->n,*ii,*ridx;
1184   PetscErrorCode               ierr;
1185   cusparseStatus_t             stat;
1186   cudaError_t                  err;
1187 
1188   PetscFunctionBegin;
1189   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
1190     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1191     if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1192       CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1193       /* copy values only */
1194       matrix->values->assign(a->a, a->a+a->nz);
1195     } else {
1196       MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1197       try {
1198         cusparsestruct->nonzerorow=0;
1199         for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1200 
1201         if (a->compressedrow.use) {
1202           m    = a->compressedrow.nrows;
1203           ii   = a->compressedrow.i;
1204           ridx = a->compressedrow.rindex;
1205         } else {
1206           /* Forcing compressed row on the GPU */
1207           int k=0;
1208           ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
1209           ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
1210           ii[0]=0;
1211           for (int j = 0; j<m; j++) {
1212             if ((a->i[j+1]-a->i[j])>0) {
1213               ii[k]  = a->i[j];
1214               ridx[k]= j;
1215               k++;
1216             }
1217           }
1218           ii[cusparsestruct->nonzerorow] = a->nz;
1219           m = cusparsestruct->nonzerorow;
1220         }
1221 
1222         /* allocate space for the triangular factor information */
1223         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1224         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1225         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1226         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
1227 
1228         err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
1229         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1230         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1231         err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1232         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1233         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1234         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
1235 
1236         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1237         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1238           /* set the matrix */
1239           CsrMatrix *matrix= new CsrMatrix;
1240           matrix->num_rows = m;
1241           matrix->num_cols = A->cmap->n;
1242           matrix->num_entries = a->nz;
1243           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1244           matrix->row_offsets->assign(ii, ii + m+1);
1245 
1246           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1247           matrix->column_indices->assign(a->j, a->j+a->nz);
1248 
1249           matrix->values = new THRUSTARRAY(a->nz);
1250           matrix->values->assign(a->a, a->a+a->nz);
1251 
1252           /* assign the pointer */
1253           matstruct->mat = matrix;
1254 
1255         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1256 #if CUDA_VERSION>=4020
1257           CsrMatrix *matrix= new CsrMatrix;
1258           matrix->num_rows = m;
1259           matrix->num_cols = A->cmap->n;
1260           matrix->num_entries = a->nz;
1261           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1262           matrix->row_offsets->assign(ii, ii + m+1);
1263 
1264           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1265           matrix->column_indices->assign(a->j, a->j+a->nz);
1266 
1267           matrix->values = new THRUSTARRAY(a->nz);
1268           matrix->values->assign(a->a, a->a+a->nz);
1269 
1270           cusparseHybMat_t hybMat;
1271           stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1272           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1273             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1274           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1275               matstruct->descr, matrix->values->data().get(),
1276               matrix->row_offsets->data().get(),
1277               matrix->column_indices->data().get(),
1278               hybMat, 0, partition);CHKERRCUDA(stat);
1279           /* assign the pointer */
1280           matstruct->mat = hybMat;
1281 
1282           if (matrix) {
1283             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1284             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1285             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1286             delete (CsrMatrix*)matrix;
1287           }
1288 #endif
1289         }
1290 
1291         /* assign the compressed row indices */
1292         matstruct->cprowIndices = new THRUSTINTARRAY(m);
1293         matstruct->cprowIndices->assign(ridx,ridx+m);
1294 
1295         /* assign the pointer */
1296         cusparsestruct->mat = matstruct;
1297 
1298         if (!a->compressedrow.use) {
1299           ierr = PetscFree(ii);CHKERRQ(ierr);
1300           ierr = PetscFree(ridx);CHKERRQ(ierr);
1301         }
1302         cusparsestruct->workVector = new THRUSTARRAY(m);
1303       } catch(char *ex) {
1304         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1305       }
1306       cusparsestruct->nonzerostate = A->nonzerostate;
1307     }
1308     ierr = WaitForGPU();CHKERRCUDA(ierr);
1309     A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
1310     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1311   }
1312   PetscFunctionReturn(0);
1313 }
1314 
/* Thrust functor used to scatter-accumulate: for a (src,dst) tuple it performs
   dst += src.  Applied over a zip of the compressed work vector and a
   permutation view of the destination vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1324 
/* Computes yy = A*xx on the GPU.  Implemented as a MatMultAdd with no vector
   to add (yy argument of MatMultAdd_SeqAIJCUSPARSE passed as NULL). */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1333 
/* Computes yy = A^T * xx on the GPU.

   A separately stored copy of the transpose (cusparsestruct->matTranspose) is
   multiplied with CUSPARSE_OPERATION_NON_TRANSPOSE; the copy is generated
   lazily on first use by MatSeqAIJCUSPARSEGenerateTransposeForMult(). */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* first transpose multiply: build the GPU copy of A^T */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  /* start from yy = 0; the spmv below is issued with beta_zero */
  ierr = VecSet(yy,0);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* yy = 1.0*A^T*xx + 0.0*yy; alpha/beta are device pointers (pointer mode DEVICE) */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL/HYB storage; NOTE(review): with CUDA < 4.2 this branch compiles to a
       no-op and yy remains zero */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  /* flop count: one multiply+add per nonzero, minus one add per nonzero row */
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1381 
1382 
/* Computes zz = A*xx + yy on the GPU (zz = A*xx when yy is NULL; this is how
   MatMult_SeqAIJCUSPARSE calls it).

   When the GPU matrix is stored with compressed rows, the spmv produces only
   the nonzero rows into cusparsestruct->workVector, which is then scattered
   and added into zz through matstruct->cprowIndices. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr);
    /* write directly into zz when no rows were compressed away (workVector
       covers all rows); otherwise go through the work vector */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n) ? zarray : cusparsestruct->workVector->data().get();
    /* beta = 1 only when the spmv can accumulate in place (zz aliases yy and
       is the spmv output); otherwise yy is added afterwards */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
#endif
    }

    if (yy) {
      if (dptr != zarray) {
        /* spmv result is in the work vector; seed zz with yy, the scatter below adds A*x */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        /* spmv wrote A*x into zz with beta = 0; add yy separately */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      /* plain MatMult through the work vector: zero zz before the scatter-add */
      ierr = VecSet(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (!yy) { /* MatMult */
    if (!cusparsestruct->stream) {
      ierr = WaitForGPU();CHKERRCUDA(ierr);
    }
  }
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1458 
/* Computes zz = A^T*xx + yy on the GPU, using the lazily built GPU copy of
   A^T (cusparsestruct->matTranspose).  The product goes into the work vector
   and is then scattered/added into zz (which starts as a copy of yy). */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* first transpose multiply: build the GPU copy of A^T */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* zz starts as yy; the transpose product is accumulated into it below */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      /* NOTE(review): the spmv writes mat->num_rows entries into workVector,
         yet workVector was sized from A's (compressed) row count in
         MatSeqAIJCUSPARSECopyToGPU — confirm the two sizes agree on the
         transpose path */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstructT->alpha, matstructT->descr, hybMat,
            xarray, matstructT->beta_zero,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    /* NOTE(review): the scatter length is A->cmap->n here, while the analogous
       scatter in MatMultAdd uses workVector->size() — verify these coincide
       whenever this path is reached */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
        VecCUDAPlusEquals());

    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1524 
/* Completes assembly: runs the host AIJ assembly, mirrors the data to the GPU
   (skipped for factor matrices), and, on a final assembly only, installs the
   CUSPARSE multiply kernels. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* only a final assembly switches the multiply operations to the GPU versions */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1541 
1542 /* --------------------------------------------------------------------------------*/
1543 /*@
1544    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
1547    assembly performance the user should preallocate the matrix storage by setting
1548    the parameter nz (or the array nnz).  By setting these parameters accurately,
1549    performance during matrix assembly can be increased by more than a factor of 50.
1550 
1551    Collective on MPI_Comm
1552 
1553    Input Parameters:
1554 +  comm - MPI communicator, set to PETSC_COMM_SELF
1555 .  m - number of rows
1556 .  n - number of columns
1557 .  nz - number of nonzeros per row (same for all rows)
1558 -  nnz - array containing the number of nonzeros in the various rows
1559          (possibly different for each row) or NULL
1560 
1561    Output Parameter:
1562 .  A - the matrix
1563 
1564    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1565    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1566    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1567 
1568    Notes:
1569    If nnz is given then nz is ignored
1570 
1571    The AIJ format (also called the Yale sparse matrix format or
1572    compressed row storage), is fully compatible with standard Fortran 77
1573    storage.  That is, the stored row and column indices can begin at
1574    either one (as in Fortran) or zero.  See the users' manual for details.
1575 
1576    Specify the preallocated storage with either nz or nnz (not both).
1577    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1578    allocation.  For large problems you MUST preallocate memory or you
1579    will get TERRIBLE performance, see the users' manual chapter on matrices.
1580 
1581    By default, this format uses inodes (identical nodes) when possible, to
1582    improve numerical efficiency of matrix-vector products and solves. We
1583    search for consecutive rows with the same nonzero structure, thereby
1584    reusing matrix information to achieve increased efficiency.
1585 
1586    Level: intermediate
1587 
1588 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1589 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* the const is cast away to match the preallocation routine's signature */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1601 
/* Releases the CUSPARSE-specific data hanging off A->spptr (either the GPU
   matrix structures or the triangular-factor structures, depending on the
   factor type) and then destroys the underlying SeqAIJ matrix. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
    /* nothing to free when the GPU copy was never allocated */
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1617 
/* Duplicates A (optionally copying values) and then equips the copy with the
   CUSPARSE-specific structures, mirroring MatConvert_SeqAIJ_SeqAIJCUSPARSE. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;

  /* the duplicate hands out CUDA vectors by default */
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusp = new Mat_SeqAIJCUSPARSE;

    cusp->mat          = 0;
    cusp->matTranspose = 0;
    cusp->workVector   = 0;
    cusp->format       = MAT_CUSPARSE_CSR;
    cusp->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    cusp->handle       = handle;
    cusp->nonzerostate = 0;
    C->spptr           = cusp;
  } else {
    /* factor matrix: set up the (not yet computed) triangular-factor pointers */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;

    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
    C->spptr                     = tri;
  }

  /* install the GPU implementations of the matrix operations */
  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1678 
/* In-place converts a SeqAIJ matrix B into a SeqAIJCUSPARSE matrix: switches
   the default vector type to CUDA, allocates the CUSPARSE-specific structures
   on B->spptr, and installs the GPU operation table. */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  /* vectors created for this matrix live on the GPU by default */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusp = new Mat_SeqAIJCUSPARSE;

    cusp->mat          = 0;
    cusp->matTranspose = 0;
    cusp->workVector   = 0;
    cusp->format       = MAT_CUSPARSE_CSR;
    cusp->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    cusp->handle       = handle;
    cusp->nonzerostate = 0;
    B->spptr           = cusp;
  } else {
    /* factor matrix: set up the (not yet computed) triangular-factor pointers */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;

    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
    B->spptr                     = tri;
  }

  /* install the GPU implementations of the matrix operations */
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1735 
/* Type constructor for MATSEQAIJCUSPARSE: creates a SeqAIJ matrix and then
   converts it in place to the CUSPARSE type.

   Fix: the return code of MatConvert_SeqAIJ_SeqAIJCUSPARSE() was assigned to
   ierr but never checked; an error there was silently dropped. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1745 
1746 /*MC
1747    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1748 
1749    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1750    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1751    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1752 
1753    Options Database Keys:
1754 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1755 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1757 
1758   Level: beginner
1759 
1760 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1761 M*/
1762 
1763 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1764 
1765 
/* Registers the CUSPARSE solver package for the factorizations it supports
   (LU, Cholesky, ILU, ICC), all backed by the same factory routine. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType factortypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  size_t              i;
  PetscErrorCode      ierr;

  PetscFunctionBegin;
  for (i=0; i<sizeof(factortypes)/sizeof(factortypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,factortypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1777 
1778 
/* Frees a Mat_SeqAIJCUSPARSE structure: the GPU matrix and its transpose copy,
   the work vector, and the cuSPARSE handle; then deletes the structure itself
   and zeroes the caller's pointer.  Safe to call with *cusparsestruct == NULL.

   Fixes: the PetscErrorCode returned by the two MultStruct destroy calls was
   silently discarded; the handle check used an assignment inside the if
   condition (a classic warning/typo hazard). */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    if ((*cusparsestruct)->handle) {
      stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1797 
/* Frees the three device arrays of a CsrMatrix, then the CsrMatrix itself,
   and zeroes the caller's pointer.  A NULL *mat is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
1810 
/* Frees one triangular-factor structure: its cuSPARSE matrix descriptor,
   solve-analysis info, and CSR storage; then deletes the structure and zeroes
   the caller's pointer.  A NULL *trifactor is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (tf) {
    if (tf->descr)     { stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUDA(stat); }
    if (tf->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(tf->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    delete tf;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1826 
/* Frees one GPU multiply structure: the stored matrix (HYB/ELL handle or CSR
   arrays, per the storage format), the matrix descriptor, the compressed-row
   index array, and the device-resident alpha/beta scalars; then deletes the
   structure and zeroes the caller's pointer.  A NULL *matstruct is a no-op.

   Fix: the PetscErrorCode returned by CsrMatrix_Destroy() was silently
   discarded, unlike the trifactor overload above which checks it. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1854 
/* Frees the full triangular-factor container: all four factor structures
   (lower/upper and their transposes), the permutation index arrays, the work
   vector, and the cuSPARSE handle; then deletes the container and zeroes the
   caller's pointer.  A NULL *trifactors is a no-op.

   Fixes: the PetscErrorCode returned by the four MultStruct destroy calls was
   silently discarded; the handle check used an assignment inside the if
   condition (a classic warning/typo hazard). */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((*trifactors)->handle) {
      stat = cusparseDestroy((*trifactors)->handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1877 
1878