xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 7f59fb53391d88fd5c171a44c948735e79dbf708)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
4 */
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
20 
21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
24 
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 
35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
36 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
37 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
38 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
39 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
/* Sets the CUDA stream that all cuSPARSE calls made through this matrix's
   cuSPARSE handle are queued on.  The stream is also cached in the GPU-side
   data structure so subsequent code can reuse it.
   NOTE(review): assumes A->spptr is a valid Mat_SeqAIJCUSPARSE (i.e. the
   matrix has been set up) -- no NULL check is performed here. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
51 
/* Installs a caller-provided cuSPARSE handle on the matrix.  If the matrix
   already holds a different handle, that handle is destroyed first, so the
   matrix effectively adopts the new one.  The pointer mode is then set to
   DEVICE, meaning scalar arguments of later cuSPARSE calls are expected to
   reside in GPU memory. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* release the handle currently held before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
67 
/* Drops (without destroying) the cuSPARSE handle stored on the matrix,
   presumably because the handle is owned elsewhere and must not be freed
   when this matrix is destroyed.  A matrix whose GPU-side structure was
   never allocated (A->spptr == NULL) is treated as a no-op instead of
   dereferencing a NULL pointer. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0); /* nothing on the GPU side yet */
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
76 
/* Reports the solver package implementing this factor matrix ("cusparse").
   Composed on factor matrices (see MatGetFactor_seqaijcusparse_cusparse)
   so MatFactorGetSolverType() can query it. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
83 
84 /*MC
85   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
86   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
89   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
90   algorithms are not recommended. This class does NOT support direct solver operations.
91 
92   Level: beginner
93 
94 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
95 M*/
96 
/* Factory routine for MATSOLVERCUSPARSE: creates an empty SEQAIJCUSPARSE
   matrix B of the same (square) size as A, configured for the requested
   factorization type.  Only the symbolic-factorization function pointers are
   installed here; the symbolic routines in turn install the numeric ones. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  /* set before MatSetType, presumably so the type constructor can see this
     is a factor matrix -- TODO confirm */
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* factor matrices get their storage from the factorization, not from a
     user preallocation */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
121 
/* Type-specific implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE.
   A sequential matrix keeps a single storage format, so MAT_CUSPARSE_MULT
   and MAT_CUSPARSE_ALL are synonyms here; anything else is rejected. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:      /* fall through: both ops set the same field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
139 
140 /*@
141    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
142    operation. Only the MatMult operation can use different GPU storage formats
143    for MPIAIJCUSPARSE matrices.
144    Not Collective
145 
146    Input Parameters:
147 +  A - Matrix of type SEQAIJCUSPARSE
148 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
149 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
150 
151    Output Parameter:
152 
153    Level: intermediate
154 
155 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
156 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation when the matrix type
     composes one; a no-op for matrix types without "MatCUSPARSESetFormat_C" */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
166 
/* Processes the -mat_cusparse_* runtime options controlling the GPU storage
   format used for SpMV and triangular solves. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* format used only for the MatMult kernels */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  /* format applied to all operations.
     NOTE(review): for factored matrices A->spptr holds a
     Mat_SeqAIJCUSPARSETriFactors (see the factor routines below), so reading
     cusparsestruct->format here looks type-unsafe -- confirm. */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
192 
/* Symbolic ILU: delegates entirely to the CPU SeqAIJ implementation, then
   redirects the numeric stage to the CUSPARSE variant. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
202 
/* Symbolic LU: delegates to the CPU SeqAIJ implementation, then redirects
   the numeric stage to the CUSPARSE variant. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
212 
/* Symbolic ICC: delegates to the CPU SeqAIJ implementation, then redirects
   the numeric stage to the CUSPARSE variant. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
222 
/* Symbolic Cholesky: delegates to the CPU SeqAIJ implementation, then
   redirects the numeric stage to the CUSPARSE variant. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
232 
/* Extracts the unit-lower-triangular factor L from the combined CPU ILU
   factor stored in A->data, assembles it in pinned host buffers as CSR with
   explicit 1.0 diagonal entries, uploads it to the GPU and runs the cuSPARSE
   triangular-solve analysis.  Only done when the GPU copy is absent or stale
   (valid_GPU_matrix is UNALLOCATED or CPU). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;       /* host CSR row offsets / column indices of L */
  PetscScalar                       *AALo;              /* host CSR values of L */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host memory so the thrust assign()s below copy at full bandwidth */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix */
      /* row 0 of L is just the unit diagonal entry */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device-side CSR copies of the host buffers */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
335 
/* Extracts the upper-triangular factor U from the combined CPU ILU factor
   (rows walked back-to-front via a->diag), assembles it in pinned host
   buffers as CSR with the diagonal stored as its reciprocal, uploads it to
   the GPU and runs the cuSPARSE triangular-solve analysis.  Only done when
   the GPU copy is absent or stale. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;       /* host CSR row offsets / column indices of U */
  PetscScalar                       *AAUp;              /* host CSR values of U */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag[] decreases, hence adiag[0]-adiag[n] counts the upper entries */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned host memory so the thrust assign()s below copy at full bandwidth */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows last-to-first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* the CPU factor stores the pivot; U carries its reciprocal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device-side CSR copies of the host buffers */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
434 
/* Pushes both ILU triangular factors to the GPU, allocates the work vector
   used by the solves, and uploads the row/column permutation index sets
   unless the orderings are the identity (in which case the permutation-free
   solve kernels are used and no upload is needed). */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a          = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowIS = a->row,colIS = a->icol;
  PetscBool                    rowIdentity,colIdentity;
  const PetscInt               *ridx,*cidx;
  PetscInt                     n = A->rmap->n,permBytes = 0;

  PetscFunctionBegin;
  /* build and analyze both triangular factors on the device */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  triFactors->workVector = new THRUSTARRAY(n);
  triFactors->nnz        = a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;

  /* row permutation (applied around the lower-triangular solve) */
  ierr = ISGetIndices(rowIS,&ridx);CHKERRQ(ierr);
  ierr = ISIdentity(rowIS,&rowIdentity);CHKERRQ(ierr);
  if (!rowIdentity) {
    triFactors->rpermIndices = new THRUSTINTARRAY(n);
    triFactors->rpermIndices->assign(ridx,ridx+n);
    permBytes += n*sizeof(PetscInt);
  }
  ierr = ISRestoreIndices(rowIS,&ridx);CHKERRQ(ierr);

  /* column permutation (applied around the upper-triangular solve) */
  ierr = ISGetIndices(colIS,&cidx);CHKERRQ(ierr);
  ierr = ISIdentity(colIS,&colIdentity);CHKERRQ(ierr);
  if (!colIdentity) {
    triFactors->cpermIndices = new THRUSTINTARRAY(n);
    triFactors->cpermIndices->assign(cidx,cidx+n);
    permBytes += n*sizeof(PetscInt);
  }

  /* log the host-to-device traffic for whichever permutations were uploaded */
  if (permBytes) {
    ierr = PetscLogCpuToGpu(permBytes);CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(colIS,&cidx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
481 
/* Builds both GPU triangular factors for an ICC factorization from the CPU
   factor.  The upper factor U is stored as CSR with unit diagonal (the
   reciprocal pivot is kept in the values but the descriptor marks the
   diagonal UNIT); the lower solve reuses the same sparsity stored
   upper-triangular and is performed with the TRANSPOSE operation on a
   diagonally rescaled copy of the values.
   NOTE(review): A->data is cast to both Mat_SeqAIJ and Mat_SeqSBAIJ below --
   this relies on the layout of the leading members (i/j/a); confirm. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;   /* shared host CSR pattern for both factors */
  PetscScalar                       *AAUp;          /* values for the upper solve */
  PetscScalar                       *AALo;          /* rescaled values for the (transposed) lower solve */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* pinned host memory so the thrust assign()s below copy at full bandwidth */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the stored pivot; both factors keep its reciprocal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* off-diagonals are negated for U and additionally scaled by the
             reciprocal pivot for the (transposed) lower solve */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* the "lower" factor is stored upper-triangular and solved transposed */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
627 
/* Pushes both ICC triangular factors to the GPU, allocates the solve work
   vector, and uploads the (single, symmetric) permutation to both the row
   and column index arrays unless the ordering is the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* a->nz counts only the stored upper triangle; the full factor has the
     off-diagonals twice plus the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    /* symmetric permutation: same indices serve as row and column perms */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
656 
/* Numeric LU/ILU: the factorization itself runs on the CPU; afterwards the
   solve function pointers are chosen (permutation-free kernels when both
   orderings are the identity) and the factors are copied to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      rowIdentity,colIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* CPU numeric factorization */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* natural ordering permits the cheaper solve kernels that skip the
     permutation steps */
  ierr = ISIdentity(fact->row,&rowIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(fact->col,&colIdentity);CHKERRQ(ierr);
  if (rowIdentity && colIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* copy the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
681 
/* Numeric Cholesky factorization: delegate the factorization itself to the
   CPU SeqAIJ kernel, then install the GPU MatSolve callbacks (natural-ordering
   variants when the permutation is trivial) and stage the factors on the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *baij = (Mat_SeqAIJ*)B->data;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* pick the MatSolve flavor according to the ordering used by the factorization */
  ierr = ISIdentity(baij->row,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* move the triangular factors over to the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
706 
/*
   Builds the transposes (CSC representations) of the lower and upper
   triangular factors already stored in loTriFactorPtr/upTriFactorPtr and
   runs the cusparse solve analysis on them, so that
   MatSolveTranspose_SeqAIJCUSPARSE*() can execute entirely on the GPU.
   The results are stored in loTriFactorPtrTranspose/upTriFactorPtrTranspose.
   Assumes the (non-transposed) factors have already been copied to the GPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; /* allocated below; initializing from the existing (NULL) transpose pointers was a dead store */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor; transposing
     flips the fill mode while type, index base and diagonal type carry over */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: the factor is stored explicitly transposed, so solves use NON_TRANSPOSE */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* hang the result off the factor bundle; loTriFactorT is only a local */
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor; transposing
     flips the fill mode while type, index base and diagonal type carry over */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: the factor is stored explicitly transposed, so solves use NON_TRANSPOSE */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* hang the result off the factor bundle; upTriFactorT is only a local */
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
837 
/*
   Generates the transpose of A on the GPU so that MatMultTranspose can run
   without host involvement.  For CSR storage this is a single csr2csc; for
   ELL/HYB the matrix is converted HYB -> CSR -> CSC -> HYB.  The result is
   stored in cusparsestruct->matTranspose.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT; /* allocated below; initializing from the existing (NULL) matTranspose was a dead store */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the matrix-transpose information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta (device-resident scalars used by the SpMV calls) */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    /* the transpose has matrixT->num_rows (= A->cmap->n) rows, so its CSR
       needs num_rows+1 offsets; the previous A->rmap->n+1 sizing was wrong
       for rectangular matrices */
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose, i.e. the CSC of A */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose); the transpose has
       the row/column dimensions swapped (the previous un-swapped sizing was
       wrong for rectangular matrices) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->cmap->n;
    tempT->num_cols = A->rmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB, using the transposed dimensions */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete the CSR/CSC temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* the transpose has A->cmap->n rows; list them all as "compressed" row indices */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
965 
/* Solves A^T x = b on the GPU for a factored matrix with a nontrivial
   ordering: b is permuted by the row permutation, the transposed upper and
   then lower factors are solved, and the column permutation produces x.
   Since A = P*L*U*Q, A^T = Q^T*U^T*L^T*P^T, so U^T is applied before L^T.
   The transposed factors are generated lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose of the factors ... on the fly, on first use */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into x.
     NOTE(review): rpermIndices/cpermIndices are only allocated for non-identity
     orderings; this routine is installed as the solver only in that case. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U^T (the factor is stored explicitly transposed) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L^T */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* restore the vector arrays and block until the GPU work is done */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1034 
/* Solves A^T x = b on the GPU when the factorization used the natural
   ordering, so no permutations of b or x are needed.  With A = L*U we have
   A^T = U^T*L^T, hence U^T is solved before L^T.  The transposed factors
   are generated lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose of the factors ... on the fly, on first use */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U^T (the factor is stored explicitly transposed): b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L^T: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* restore the vector arrays and block until the GPU work is done */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1085 
/* Solves A x = b on the GPU for a factored matrix with a nontrivial
   ordering: b is permuted by the row permutation, L then U are solved,
   and the column permutation produces x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into x.
     NOTE(review): rpermIndices/cpermIndices are only allocated for non-identity
     orderings; this routine is installed as the solver only in that case. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* NOTE(review): the GPU timer is stopped before the permutation copies below,
     whereas MatSolveTranspose_SeqAIJCUSPARSE includes them in the timed span —
     confirm which accounting is intended. */

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector arrays and block until the GPU work is done */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1146 
/* Solves A x = b on the GPU when the factorization used the natural
   ordering: no permutations are needed, so L is solved directly from b
   into the work vector and U from the work vector into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* restore the vector arrays and block until the GPU work is done */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1189 
1190 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1191 {
1192 
1193   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1194   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1195   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1196   PetscInt                     m = A->rmap->n,*ii,*ridx;
1197   PetscErrorCode               ierr;
1198   cusparseStatus_t             stat;
1199   cudaError_t                  err;
1200 
1201   PetscFunctionBegin;
1202   if (A->pinnedtocpu) PetscFunctionReturn(0);
1203   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
1204     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1205     if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1206       CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1207       /* copy values only */
1208       matrix->values->assign(a->a, a->a+a->nz);
1209       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1210     } else {
1211       MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1212       try {
1213         cusparsestruct->nonzerorow=0;
1214         for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1215 
1216         if (a->compressedrow.use) {
1217           m    = a->compressedrow.nrows;
1218           ii   = a->compressedrow.i;
1219           ridx = a->compressedrow.rindex;
1220         } else {
1221           /* Forcing compressed row on the GPU */
1222           int k=0;
1223           ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
1224           ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
1225           ii[0]=0;
1226           for (int j = 0; j<m; j++) {
1227             if ((a->i[j+1]-a->i[j])>0) {
1228               ii[k]  = a->i[j];
1229               ridx[k]= j;
1230               k++;
1231             }
1232           }
1233           ii[cusparsestruct->nonzerorow] = a->nz;
1234           m = cusparsestruct->nonzerorow;
1235         }
1236 
1237         /* allocate space for the triangular factor information */
1238         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1239         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1240         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1241         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
1242 
1243         err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
1244         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1245         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1246         err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1247         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1248         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1249         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
1250 
1251         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1252         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1253           /* set the matrix */
1254           CsrMatrix *matrix= new CsrMatrix;
1255           matrix->num_rows = m;
1256           matrix->num_cols = A->cmap->n;
1257           matrix->num_entries = a->nz;
1258           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1259           matrix->row_offsets->assign(ii, ii + m+1);
1260 
1261           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1262           matrix->column_indices->assign(a->j, a->j+a->nz);
1263 
1264           matrix->values = new THRUSTARRAY(a->nz);
1265           matrix->values->assign(a->a, a->a+a->nz);
1266 
1267           /* assign the pointer */
1268           matstruct->mat = matrix;
1269 
1270         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1271           CsrMatrix *matrix= new CsrMatrix;
1272           matrix->num_rows = m;
1273           matrix->num_cols = A->cmap->n;
1274           matrix->num_entries = a->nz;
1275           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1276           matrix->row_offsets->assign(ii, ii + m+1);
1277 
1278           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1279           matrix->column_indices->assign(a->j, a->j+a->nz);
1280 
1281           matrix->values = new THRUSTARRAY(a->nz);
1282           matrix->values->assign(a->a, a->a+a->nz);
1283 
1284           cusparseHybMat_t hybMat;
1285           stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1286           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1287             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1288           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1289               matstruct->descr, matrix->values->data().get(),
1290               matrix->row_offsets->data().get(),
1291               matrix->column_indices->data().get(),
1292               hybMat, 0, partition);CHKERRCUDA(stat);
1293           /* assign the pointer */
1294           matstruct->mat = hybMat;
1295 
1296           if (matrix) {
1297             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1298             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1299             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1300             delete (CsrMatrix*)matrix;
1301           }
1302         }
1303 
1304         /* assign the compressed row indices */
1305         matstruct->cprowIndices = new THRUSTINTARRAY(m);
1306         matstruct->cprowIndices->assign(ridx,ridx+m);
1307         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1308 
1309         /* assign the pointer */
1310         cusparsestruct->mat = matstruct;
1311 
1312         if (!a->compressedrow.use) {
1313           ierr = PetscFree(ii);CHKERRQ(ierr);
1314           ierr = PetscFree(ridx);CHKERRQ(ierr);
1315         }
1316         cusparsestruct->workVector = new THRUSTARRAY(m);
1317       } catch(char *ex) {
1318         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1319       }
1320       cusparsestruct->nonzerostate = A->nonzerostate;
1321     }
1322     ierr = WaitForGPU();CHKERRCUDA(ierr);
1323     A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
1324     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1325   }
1326   PetscFunctionReturn(0);
1327 }
1328 
/* Functor for thrust::for_each over a zip iterator of (source, destination)
   references: accumulates element 0 into element 1 (dest += src).  Used to
   scatter-add a compressed SpMV result back into a full-length vector.
   Callable from both host and device code. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple tup)
  {
    thrust::get<1>(tup) += thrust::get<0>(tup);
  }
};
1338 
/* Computes yy = A*xx on the GPU by deferring to MatMultAdd with a NULL addend. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* a NULL third argument tells MatMultAdd_SeqAIJCUSPARSE this is a plain multiply */
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1347 
/*
   Computes yy = A^T * xx on the GPU.

   An explicit transpose (matTranspose) is generated lazily on first use and
   cached in the Mat_SeqAIJCUSPARSE structure, so the cusparse SpMV below is a
   NON_TRANSPOSE multiply with that stored transpose.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* first transpose multiply: build and cache the transpose structure */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  /* zero yy up front; NOTE(review): the SpMV uses beta_zero, so this looks
     needed only when the (compressed) transpose does not cover every entry
     of yy -- confirm before removing */
  ierr = VecSet(yy,0);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* CSR storage: y = alpha*A_T*x + beta_zero*y */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
    /* ELL/HYB storage */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only synchronize when no user-provided stream is in use */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  /* one multiply+add per nonzero, minus one add per nonempty row */
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1395 
1396 
/*
   Computes zz = A*xx + yy on the GPU; with yy == NULL it computes zz = A*xx
   (this is how MatMult_SeqAIJCUSPARSE calls it).

   The GPU matrix is stored with compressed rows, so the SpMV result may be
   shorter than zz; in that case it is written to workVector and scattered
   (added) back into zz through matstruct->cprowIndices.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* if no rows were compressed away (workVector has full length), the SpMV
       can write straight into zz; otherwise it goes through workVector */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n) ? zarray : cusparsestruct->workVector->data().get();
    /* accumulate in the SpMV itself (beta=1) only when it writes into zz and zz already holds yy */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      /* ELL/HYB path; skipped entirely when there are no nonempty rows */
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) {
      if (dptr != zarray) {
        /* product sits in workVector: seed zz with yy; the scatter below adds the product */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        /* product already in zz (beta was zero): add yy */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      /* plain multiply through the work vector: zero zz before the scatter-add */
      ierr = VecSet(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      /* zz[cprowIndices[i]] += workVector[i] for each compressed row i */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (!yy) { /* MatMult */
    if (!cusparsestruct->stream) {
      ierr = WaitForGPU();CHKERRCUDA(ierr);
    }
  }
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1474 
/*
   Computes zz = A^T*xx + yy on the GPU.

   Uses the lazily-built explicit transpose (matTranspose).  The SpMV result
   goes into workVector and is scattered (added) into zz, which has been
   pre-loaded with yy.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* first transpose multiply: build and cache the transpose structure */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* seed zz with yy; the scatter at the end adds A^T*xx on top */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstructT->alpha, matstructT->descr, hybMat,
            xarray, matstructT->beta_zero,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    /* NOTE(review): the scatter length here is A->cmap->n, while the source is
       workVector and the sibling MatMultAdd uses workVector->size(); these
       agree only when they coincide -- confirm for compressed-row matrices */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
        VecCUDAPlusEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1539 
/* Complete assembly: finish the CPU-side SeqAIJ assembly, mirror the data to
   the GPU (skipped for factor matrices), and -- on final assembly only --
   install the CUSPARSE multiply operations. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* final assembly: ensure the GPU implementations of the multiplies are in place */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1556 
1557 /* --------------------------------------------------------------------------------*/
1558 /*@
1559    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1560    (the default parallel PETSc format). This matrix will ultimately be pushed down
1561    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1562    assembly performance the user should preallocate the matrix storage by setting
1563    the parameter nz (or the array nnz).  By setting these parameters accurately,
1564    performance during matrix assembly can be increased by more than a factor of 50.
1565 
1566    Collective
1567 
1568    Input Parameters:
1569 +  comm - MPI communicator, set to PETSC_COMM_SELF
1570 .  m - number of rows
1571 .  n - number of columns
1572 .  nz - number of nonzeros per row (same for all rows)
1573 -  nnz - array containing the number of nonzeros in the various rows
1574          (possibly different for each row) or NULL
1575 
1576    Output Parameter:
1577 .  A - the matrix
1578 
1579    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1580    MatXXXXSetPreallocation() paradigm instead of this routine directly.
1581    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1582 
1583    Notes:
1584    If nnz is given then nz is ignored
1585 
1586    The AIJ format (also called the Yale sparse matrix format or
1587    compressed row storage), is fully compatible with standard Fortran 77
1588    storage.  That is, the stored row and column indices can begin at
1589    either one (as in Fortran) or zero.  See the users' manual for details.
1590 
1591    Specify the preallocated storage with either nz or nnz (not both).
1592    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1593    allocation.  For large problems you MUST preallocate memory or you
1594    will get TERRIBLE performance, see the users' manual chapter on matrices.
1595 
1596    By default, this format uses inodes (identical nodes) when possible, to
1597    improve numerical efficiency of matrix-vector products and solves. We
1598    search for consecutive rows with the same nonzero structure, thereby
1599    reusing matrix information to achieve increased efficiency.
1600 
1601    Level: intermediate
1602 
1603 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1604 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local sizes are also the global sizes */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate with the SeqAIJ implementation directly; nnz (if given) overrides nz */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1616 
/* Release the GPU-side data hanging off A->spptr, then hand off to the
   standard SeqAIJ destroy for the CPU-side data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrix: spptr holds the triangular-factor structures */
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
    /* plain matrix: free the mult structures only if a GPU copy was ever built */
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1632 
/* Duplicate a SEQAIJCUSPARSE matrix: duplicate the CPU (SeqAIJ) part, then
   attach a fresh, empty GPU data structure to the copy; the GPU data is
   rebuilt lazily (valid_GPU_matrix starts as PETSC_OFFLOAD_UNALLOCATED). */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;
  Mat C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;
    C->spptr = spptr;
    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle       = handle;
    spptr->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;
    C->spptr = tri;
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    tri->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1691 
/* In-place conversion of a SeqAIJ matrix to SEQAIJCUSPARSE: switch the
   default vector type to CUDA, attach an empty GPU data structure to
   B->spptr, and install the CUSPARSE operation table. */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;
    B->spptr = spptr;
    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle       = handle;
    spptr->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;
    B->spptr = tri;
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1745 
/* MATSEQAIJCUSPARSE constructor: build a SeqAIJ matrix, then convert it in
   place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  /* bug fix: the error code of the conversion was previously assigned but never checked */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1755 
1756 /*MC
1757    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1758 
1759    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1760    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1761    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1762 
1763    Options Database Keys:
1764 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1765 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1766 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1767 
1768   Level: beginner
1769 
1770 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1771 M*/
1772 
1773 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1774 
1775 
/* Registers the CUSPARSE solver package for every factorization it supports
   (LU, Cholesky, ILU, ICC) on MATSEQAIJCUSPARSE matrices; all four map to the
   same factory routine MatGetFactor_seqaijcusparse_cusparse. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1787 
1788 
/*
   Frees a Mat_SeqAIJCUSPARSE structure: the mult structures for the matrix
   and its cached transpose, the compressed-row work vector, and the cusparse
   handle.  Tolerates *cusparsestruct == NULL and zeroes the caller's pointer.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    /* bug fix: the error codes of the two destroy calls were previously ignored */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;  /* deleting NULL is a no-op */
    if ((handle = (*cusparsestruct)->handle)) {  /* extra parentheses: assignment is intended */
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1807 
/* Releases the thrust arrays backing a CsrMatrix and the struct itself;
   tolerates *mat == NULL and zeroes the caller's pointer. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;
    *mat = 0;
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
  }
  PetscFunctionReturn(0);
}
1820 
/* Releases one triangular-factor structure (matrix descriptor, solve
   analysis info, and CSR data); tolerates *trifactor == NULL and zeroes the
   caller's pointer. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    Mat_SeqAIJCUSPARSETriFactorStruct *f = *trifactor;
    if (f->descr)     { stat = cusparseDestroyMatDescr(f->descr);CHKERRCUDA(stat); }
    if (f->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(f->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&f->csrMat);CHKERRQ(ierr);
    delete f;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1836 
/*
   Frees a Mat_SeqAIJCUSPARSEMultStruct: the device matrix (HYB or CSR,
   chosen by format), the matrix descriptor, the compressed-row index array
   and the device-resident alpha/beta constants.  Tolerates *matstruct ==
   NULL and zeroes the caller's pointer.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        /* bug fix: the error code of CsrMatrix_Destroy was previously ignored;
           also narrowed the scope of 'mat' to the only branch using it */
        CsrMatrix *mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;  /* deleting NULL is a no-op */
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1864 
/*
   Frees the full triangular-factor bundle: the four factor structures, the
   row/column permutation index arrays, the solve work vector and the
   cusparse handle.  Tolerates *trifactors == NULL and zeroes the caller's
   pointer.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    /* bug fix: the error codes of the four destroy calls were previously ignored */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;  /* deleting NULL is a no-op */
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((handle = (*trifactors)->handle)) {  /* extra parentheses: assignment is intended */
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1887 
1888