xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 4992a29c8af173a30b29871e292067c816d21b83)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
20 
21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
24 
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 
35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
36 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
37 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
38 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
39 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
/* MatCUSPARSESetStream - record a CUDA stream in the matrix's GPU data and bind
   it to the matrix's cuSPARSE handle, so subsequent cuSPARSE calls made for A
   are issued on that stream.
   NOTE(review): assumes A->spptr is a valid, non-NULL Mat_SeqAIJCUSPARSE with a
   live handle -- no check is performed; confirm callers guarantee this. */
41 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
42 {
43   cusparseStatus_t   stat;
44   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
45 
46   PetscFunctionBegin;
47   cusparsestruct->stream = stream;
48   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
49   PetscFunctionReturn(0);
50 }
51 
/* MatCUSPARSESetHandle - store an externally supplied cuSPARSE handle in the
   matrix.  If the matrix already holds a different handle, that old handle is
   destroyed first.  The handle's pointer mode is then set to DEVICE, i.e.
   scalar arguments of later cuSPARSE calls are read from GPU memory. */
52 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
53 {
54   cusparseStatus_t   stat;
55   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
56 
57   PetscFunctionBegin;
   /* replace only when the handle actually changes; destroy the old one to avoid a leak */
58   if (cusparsestruct->handle != handle) {
59     if (cusparsestruct->handle) {
60       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
61     }
62     cusparsestruct->handle = handle;
63   }
64   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
65   PetscFunctionReturn(0);
66 }
67 
68 PetscErrorCode MatCUSPARSEClearHandle(Mat A)
69 {
70   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
71   PetscFunctionBegin;
72   if (cusparsestruct->handle)
73     cusparsestruct->handle = 0;
74   PetscFunctionReturn(0);
75 }
76 
/* Query callback composed on factor matrices (see
   MatGetFactor_seqaijcusparse_cusparse below): reports MATSOLVERCUSPARSE as
   the solver package.  The matrix argument A is unused. */
77 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
78 {
79   PetscFunctionBegin;
80   *type = MATSOLVERCUSPARSE;
81   PetscFunctionReturn(0);
82 }
83 
84 /*MC
85   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
86   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
87   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
88   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
89   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
90   algorithms are not recommended. This class does NOT support direct solver operations.
91 
92   Level: beginner
93 
94 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
95 M*/
96 
/* MatGetFactor_seqaijcusparse_cusparse - create an empty SEQAIJCUSPARSE factor
   matrix B, sized like (square) A, and install the symbolic factorization
   routines for the requested factor type.  The symbolic routines later install
   the matching GPU-aware numeric routines. */
97 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
98 {
99   PetscErrorCode ierr;
100   PetscInt       n = A->rmap->n;
101 
102   PetscFunctionBegin;
103   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
104   (*B)->factortype = ftype;
105   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
106   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
107 
   /* LU family: also propagate block sizes from A */
108   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
109     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
110     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
111     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
112   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
113     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
114     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
115   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
116 
   /* MAT_SKIP_ALLOCATION: structure/values are allocated by the factorization itself */
117   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
118   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
119   PetscFunctionReturn(0);
120 }
121 
122 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
123 {
124   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
125 
126   PetscFunctionBegin;
127   switch (op) {
128   case MAT_CUSPARSE_MULT:
129     cusparsestruct->format = format;
130     break;
131   case MAT_CUSPARSE_ALL:
132     cusparsestruct->format = format;
133     break;
134   default:
135     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
136   }
137   PetscFunctionReturn(0);
138 }
139 
140 /*@
141    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
142    operation. Only the MatMult operation can use different GPU storage formats
143    for MPIAIJCUSPARSE matrices.
144    Not Collective
145 
146    Input Parameters:
147 +  A - Matrix of type SEQAIJCUSPARSE
148 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
149 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
150 
151    Output Parameter:
152 
153    Level: intermediate
154 
155 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
156 @*/
157 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
158 {
159   PetscErrorCode ierr;
160 
161   PetscFunctionBegin;
162   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
   /* PetscTryMethod: silently a no-op for matrix types that do not compose
      "MatCUSPARSESetFormat_C" (i.e. non-CUSPARSE matrices) */
163   ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
164   PetscFunctionReturn(0);
165 }
166 
/* Options processing for SEQAIJCUSPARSE: lets the user pick the GPU storage
   format.  -mat_cusparse_mult_storage_format affects only SpMV (and only for
   unfactored matrices); -mat_cusparse_storage_format affects all operations. */
167 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
168 {
169   PetscErrorCode           ierr;
170   MatCUSPARSEStorageFormat format;
171   PetscBool                flg;
172   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
173 
174   PetscFunctionBegin;
175   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
   /* the SpMV-only format choice is offered only for unfactored matrices */
176   if (A->factortype==MAT_FACTOR_NONE) {
177     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
178                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
179     if (flg) {
180       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
181     }
182   }
183   ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
184                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
185   if (flg) {
186     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
187   }
188   ierr = PetscOptionsTail();CHKERRQ(ierr);
189   PetscFunctionReturn(0);
190 
191 }
192 
/* Symbolic ILU: defer to the CPU AIJ implementation, then point the numeric
   phase at the CUSPARSE version (which also uploads the factors to the GPU).
   The assignment must come after the call, since the CPU routine also sets
   this operation pointer. */
193 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
194 {
195   PetscErrorCode ierr;
196 
197   PetscFunctionBegin;
198   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
199   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
200   PetscFunctionReturn(0);
201 }
202 
/* Symbolic LU: defer to the CPU AIJ implementation, then point the numeric
   phase at the CUSPARSE version (which also uploads the factors to the GPU). */
203 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
204 {
205   PetscErrorCode ierr;
206 
207   PetscFunctionBegin;
208   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
209   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
210   PetscFunctionReturn(0);
211 }
212 
/* Symbolic ICC: defer to the CPU AIJ implementation, then point the numeric
   phase at the CUSPARSE version (which also uploads the factors to the GPU). */
213 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
214 {
215   PetscErrorCode ierr;
216 
217   PetscFunctionBegin;
218   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
219   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
220   PetscFunctionReturn(0);
221 }
222 
/* Symbolic Cholesky: defer to the CPU AIJ implementation, then point the
   numeric phase at the CUSPARSE version (which also uploads the factors). */
223 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
224 {
225   PetscErrorCode ierr;
226 
227   PetscFunctionBegin;
228   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
229   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
230   PetscFunctionReturn(0);
231 }
232 
/* MatSeqAIJCUSPARSEBuildILULowerTriMatrix - extract the unit-lower-triangular
   factor L from the host ILU/LU factor stored in A->data, build it as a CSR
   matrix on the GPU, and run the cuSPARSE triangular-solve analysis.
   NOTE(review): assumes each row of the combined host factor stores its
   strictly-lower entries first (so ai[i+1]-ai[i] lower entries for row i) --
   confirm against MatLUFactorNumeric_SeqAIJ's factor layout. */
233 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
234 {
235   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
236   PetscInt                          n = A->rmap->n;
237   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
238   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
239   cusparseStatus_t                  stat;
240   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
241   const MatScalar                   *aa = a->a,*v;
242   PetscInt                          *AiLo, *AjLo;
243   PetscScalar                       *AALo;
244   PetscInt                          i,nz, nzLower, offset, rowOffset;
245   PetscErrorCode                    ierr;
246 
247   PetscFunctionBegin;
   /* only (re)build when there is no GPU copy or the CPU copy is newer */
248   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
249     try {
250       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
251       nzLower=n+ai[n]-ai[1];
252 
253       /* Allocate Space for the lower triangular matrix */
      /* pinned host memory (cudaMallocHost) for fast host->device transfer */
254       ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
255       ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
256       ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);
257 
258       /* Fill the lower triangular matrix */
      /* row 0 of L is just the unit diagonal entry */
259       AiLo[0]  = (PetscInt) 0;
260       AiLo[n]  = nzLower;
261       AjLo[0]  = (PetscInt) 0;
262       AALo[0]  = (MatScalar) 1.0;
263       v        = aa;
264       vi       = aj;
265       offset   = 1;
266       rowOffset= 1;
   /* each row i>0: copy its nz strictly-lower entries, then append the unit diagonal */
267       for (i=1; i<n; i++) {
268         nz = ai[i+1] - ai[i];
269         /* additional 1 for the term on the diagonal */
270         AiLo[i]    = rowOffset;
271         rowOffset += nz+1;
272 
273         ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
274         ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
275 
276         offset      += nz;
277         AjLo[offset] = (PetscInt) i;
278         AALo[offset] = (MatScalar) 1.0;
279         offset      += 1;
280 
281         v  += nz;
282         vi += nz;
283       }
284 
285       /* allocate space for the triangular factor information */
286       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
287 
288       /* Create the matrix description */
      /* lower triangular, zero-based, unit diagonal: matches the CSR built above */
289       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
290       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
291       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
292       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
293       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
294 
295       /* Create the solve analysis information */
296       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
297 
298       /* set the operation */
299       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
300 
301       /* set the matrix */
302       loTriFactor->csrMat = new CsrMatrix;
303       loTriFactor->csrMat->num_rows = n;
304       loTriFactor->csrMat->num_cols = n;
305       loTriFactor->csrMat->num_entries = nzLower;
306 
   /* thrust assign() performs the host->device copies */
307       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
308       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
309 
310       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
311       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
312 
313       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
314       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
315 
316       /* perform the solve analysis */
317       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
318                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
319                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
320                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
321 
322       /* assign the pointer. Is this really necessary? */
323       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
324 
   /* release the pinned staging buffers and log the bytes uploaded */
325       ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
326       ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
327       ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
328       ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
329     } catch(char *ex) {
330       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
331     }
332   }
333   PetscFunctionReturn(0);
334 }
335 
/* MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - extract the upper triangular
   factor U from the host ILU/LU factor, build it as a CSR matrix on the GPU,
   and run the cuSPARSE triangular-solve analysis.  The diagonal is stored
   already inverted (1/d) and the rows are assembled back-to-front using
   a->diag.
   NOTE(review): assumes a->diag follows the MatLUFactorNumeric_SeqAIJ
   convention where adiag[i] points at row i's diagonal in the factor and
   adiag decreases with i -- confirm. */
336 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
337 {
338   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
339   PetscInt                          n = A->rmap->n;
340   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
341   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
342   cusparseStatus_t                  stat;
343   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
344   const MatScalar                   *aa = a->a,*v;
345   PetscInt                          *AiUp, *AjUp;
346   PetscScalar                       *AAUp;
347   PetscInt                          i,nz, nzUpper, offset;
348   PetscErrorCode                    ierr;
349 
350   PetscFunctionBegin;
   /* only (re)build when there is no GPU copy or the CPU copy is newer */
351   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
352     try {
353       /* next, figure out the number of nonzeros in the upper triangular matrix. */
354       nzUpper = adiag[0]-adiag[n];
355 
356       /* Allocate Space for the upper triangular matrix */
      /* pinned host memory (cudaMallocHost) for fast host->device transfer */
357       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
358       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
359       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
360 
361       /* Fill the upper triangular matrix */
362       AiUp[0]=(PetscInt) 0;
363       AiUp[n]=nzUpper;
364       offset = nzUpper;
   /* walk rows from last to first, filling each row's slice from the back */
365       for (i=n-1; i>=0; i--) {
366         v  = aa + adiag[i+1] + 1;
367         vi = aj + adiag[i+1] + 1;
368 
369         /* number of elements NOT on the diagonal */
370         nz = adiag[i] - adiag[i+1]-1;
371 
372         /* decrement the offset */
373         offset -= (nz+1);
374 
375         /* first, set the diagonal elements */
      /* diagonal is stored as its reciprocal so the solve can multiply */
376         AjUp[offset] = (PetscInt) i;
377         AAUp[offset] = (MatScalar)1./v[nz];
378         AiUp[i]      = AiUp[i+1] - (nz+1);
379 
380         ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
381         ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
382       }
383 
384       /* allocate space for the triangular factor information */
385       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
386 
387       /* Create the matrix description */
      /* upper triangular, zero-based, non-unit diagonal */
388       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
389       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
390       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
391       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
392       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
393 
394       /* Create the solve analysis information */
395       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
396 
397       /* set the operation */
398       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
399 
400       /* set the matrix */
401       upTriFactor->csrMat = new CsrMatrix;
402       upTriFactor->csrMat->num_rows = n;
403       upTriFactor->csrMat->num_cols = n;
404       upTriFactor->csrMat->num_entries = nzUpper;
405 
   /* thrust assign() performs the host->device copies */
406       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
407       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
408 
409       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
410       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
411 
412       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
413       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
414 
415       /* perform the solve analysis */
416       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
417                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
418                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
419                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
420 
421       /* assign the pointer. Is this really necessary? */
422       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
423 
   /* release the pinned staging buffers and log the bytes uploaded */
424       ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
425       ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
426       ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
427       ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
428     } catch(char *ex) {
429       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
430     }
431   }
432   PetscFunctionReturn(0);
433 }
434 
435 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
436 {
437   PetscErrorCode               ierr;
438   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
439   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
440   IS                           isrow = a->row,iscol = a->icol;
441   PetscBool                    row_identity,col_identity;
442   const PetscInt               *r,*c;
443   PetscInt                     n = A->rmap->n;
444 
445   PetscFunctionBegin;
446   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
447   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
448 
449   cusparseTriFactors->workVector = new THRUSTARRAY(n);
450   cusparseTriFactors->nnz=a->nz;
451 
452   A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
453   /*lower triangular indices */
454   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
455   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
456   if (!row_identity) {
457     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
458     cusparseTriFactors->rpermIndices->assign(r, r+n);
459   }
460   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
461 
462   /*upper triangular indices */
463   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
464   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
465   if (!col_identity) {
466     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
467     cusparseTriFactors->cpermIndices->assign(c, c+n);
468   }
469 
470   if(!row_identity && !col_identity) {
471     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
472   } else if(!row_identity) {
473     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
474   } else if(!col_identity) {
475     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
476   }
477 
478   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
479   PetscFunctionReturn(0);
480 }
481 
/* MatSeqAIJCUSPARSEBuildICCTriMatrices - build both triangular factors of an
   ICC/Cholesky factorization on the GPU from the host factor.  Only the upper
   factor's CSR pattern is built; the "lower" solve reuses that pattern with
   CUSPARSE_OPERATION_TRANSPOSE and a separately scaled value array (AALo).
   NOTE(review): A->data is cast both to Mat_SeqAIJ (for nz) and Mat_SeqSBAIJ
   (for i/j/a); this relies on the two structs agreeing in those leading
   fields -- confirm the factor really uses the sbaij-style layout. */
482 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
483 {
484   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
485   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
486   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
487   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
488   cusparseStatus_t                  stat;
489   PetscErrorCode                    ierr;
490   PetscInt                          *AiUp, *AjUp;
491   PetscScalar                       *AAUp;
492   PetscScalar                       *AALo;
493   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
494   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
495   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
496   const MatScalar                   *aa = b->a,*v;
497 
498   PetscFunctionBegin;
   /* only (re)build when there is no GPU copy or the CPU copy is newer */
499   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
500     try {
501       /* Allocate Space for the upper triangular matrix */
      /* pinned host memory (cudaMallocHost) for fast host->device transfer */
502       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
503       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
504       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
505       ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
506 
507       /* Fill the upper triangular matrix */
508       AiUp[0]=(PetscInt) 0;
509       AiUp[n]=nzUpper;
510       offset = 0;
511       for (i=0; i<n; i++) {
512         /* set the pointers */
513         v  = aa + ai[i];
514         vj = aj + ai[i];
515         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
516 
517         /* first, set the diagonal elements */
      /* v[nz] is the row's diagonal entry; both factors store its reciprocal here
         (the upper factor's descriptor is DIAG_TYPE_UNIT, see below) */
518         AjUp[offset] = (PetscInt) i;
519         AAUp[offset] = (MatScalar)1.0/v[nz];
520         AiUp[i]      = offset;
521         AALo[offset] = (MatScalar)1.0/v[nz];
522 
523         offset+=1;
524         if (nz>0) {
525           ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
526           ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
       /* off-diagonals: negated for the upper solve; additionally scaled by the
          reciprocal diagonal for the transposed "lower" solve */
527           for (j=offset; j<offset+nz; j++) {
528             AAUp[j] = -AAUp[j];
529             AALo[j] = AAUp[j]/v[nz];
530           }
531           offset+=nz;
532         }
533       }
534 
535       /* allocate space for the triangular factor information */
536       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
537 
538       /* Create the matrix description */
      /* upper triangular, zero-based, UNIT diagonal: stored diagonal values are
         not used by the solve */
539       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
540       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
541       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
542       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
543       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
544 
545       /* Create the solve analysis information */
546       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
547 
548       /* set the operation */
549       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
550 
551       /* set the matrix */
552       upTriFactor->csrMat = new CsrMatrix;
553       upTriFactor->csrMat->num_rows = A->rmap->n;
554       upTriFactor->csrMat->num_cols = A->cmap->n;
555       upTriFactor->csrMat->num_entries = a->nz;
556 
557       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
558       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
559 
560       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
561       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
562 
563       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
564       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
565 
566       /* perform the solve analysis */
567       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
568                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
569                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
570                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
571 
572       /* assign the pointer. Is this really necessary? */
573       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
574 
575       /* allocate space for the triangular factor information */
576       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
577 
578       /* Create the matrix description */
      /* the "lower" factor reuses the UPPER pattern and is applied with
         CUSPARSE_OPERATION_TRANSPOSE below; diagonal values (1/d) ARE used */
579       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
580       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
581       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
582       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
583       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
584 
585       /* Create the solve analysis information */
586       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
587 
588       /* set the operation */
589       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
590 
591       /* set the matrix */
592       loTriFactor->csrMat = new CsrMatrix;
593       loTriFactor->csrMat->num_rows = A->rmap->n;
594       loTriFactor->csrMat->num_cols = A->cmap->n;
595       loTriFactor->csrMat->num_entries = a->nz;
596 
597       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
598       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
599 
600       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
601       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
602 
603       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
604       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
605       ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
606 
607       /* perform the solve analysis */
608       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
609                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
610                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
611                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
612 
613       /* assign the pointer. Is this really necessary? */
614       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
615 
616       A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
   /* release the pinned staging buffers */
617       ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
618       ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
619       ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
620       ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
621     } catch(char *ex) {
622       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
623     }
624   }
625   PetscFunctionReturn(0);
626 }
627 
628 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
629 {
630   PetscErrorCode               ierr;
631   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
632   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
633   IS                           ip = a->row;
634   const PetscInt               *rip;
635   PetscBool                    perm_identity;
636   PetscInt                     n = A->rmap->n;
637 
638   PetscFunctionBegin;
639   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
640   cusparseTriFactors->workVector = new THRUSTARRAY(n);
641   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
642 
643   /*lower triangular indices */
644   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
645   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
646   if (!perm_identity) {
647     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
648     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
649     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
650     cusparseTriFactors->cpermIndices->assign(rip, rip+n);
651     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
652   }
653   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
654   PetscFunctionReturn(0);
655 }
656 
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      rowid,colid,natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* numeric LU factorization itself is done by the CPU AIJ kernel */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* A natural ordering (identity row AND column permutations) allows the
     faster MatSolve variants that skip the permutation steps */
  ierr = ISIdentity(fact->row,&rowid);CHKERRQ(ierr);
  ierr = ISIdentity(fact->col,&colid);CHKERRQ(ierr);
  natural = (PetscBool)(rowid && colid);
  B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering
                                   : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering
                                   : MatSolveTranspose_SeqAIJCUSPARSE;

  /* copy the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
681 
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* numeric Cholesky factorization itself is done by the CPU AIJ kernel */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* An identity permutation allows the faster MatSolve variants that skip
     the permutation steps (Cholesky uses a single symmetric ordering) */
  ierr = ISIdentity(fact->row,&natural);CHKERRQ(ierr);
  B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering
                                   : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering
                                   : MatSolveTranspose_SeqAIJCUSPARSE;

  /* copy the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
706 
/*
   Builds the transposes (CSC form, stored as CSR of the transpose) of the
   lower and upper triangular factors already resident on the GPU, and runs
   the cusparse solve analysis on each, so MatSolveTranspose can run entirely
   on the GPU.  Results are attached to the Mat_SeqAIJCUSPARSETriFactors as
   loTriFactorPtrTranspose/upTriFactorPtrTranspose.  Callers invoke this
   lazily, only when those pointers are still NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; /* allocated below; previously dead-initialized from loTriFactorPtrTranspose */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; /* allocated below; previously dead-initialized from upTriFactorPtrTranspose */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor;
     the fill mode is flipped because the transpose of a lower triangular
     matrix is upper triangular (and vice versa) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: NON_TRANSPOSE because the data itself is transposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* stash the result so subsequent transpose solves find it */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor;
     again the fill mode is flipped for the transpose */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: NON_TRANSPOSE because the data itself is transposed */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* stash the result so subsequent transpose solves find it */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
837 
/*
   Builds and caches the transpose of the matrix on the GPU so that
   MatMultTranspose can be performed as a plain (non-transposed) SpMV.
   CSR storage: transpose directly via cusparse_csr2csc.
   ELL/HYB storage: convert HYB->CSR, transpose CSR->CSC, convert back
   CSR->HYB, freeing the CSR temporaries.
   The result is stored in ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT; /* allocated below; previously dead-initialized from cusparsestruct->matTranspose */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate the transpose multiply structure and its descriptor */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-resident scalar constants used by the SpMV calls
     (pointer mode DEVICE below means cusparse reads them from GPU memory) */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* the transpose of an m x n matrix is n x m */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose, i.e. the CSC of the matrix */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose).
       NOTE(review): tempT (which holds the transpose) uses A->rmap->n for its
       row count and row_offsets size, as does the csr2hyb call below; this
       looks correct only for square matrices -- TODO confirm for rectangular A */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete the CSR temporaries used during the conversion */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* assign the compressed row indices: the transpose is not compressed, so
     these are simply 0..A->cmap->n-1 */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* stash the result so subsequent transpose multiplies find it */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
965 
/*
   Solves A^T x = b on the GPU using the transposed triangular factors
   (built lazily on first use).  This variant applies the row/column
   permutations stored in cusparseTriFactors; it is only selected when the
   ordering is not the identity, so rpermIndices/cpermIndices are assumed
   non-NULL here -- TODO confirm against the factor-numeric dispatch.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first call only) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, apply the row permutation: x <- b permuted by rpermIndices */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Then solve with the transposed upper factor (U^T is applied first for the transpose system) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then solve with the transposed lower factor, writing the result back into x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1034 
/*
   Solves A^T x = b on the GPU using the transposed triangular factors
   (built lazily on first use).  Natural-ordering variant: no permutations
   are applied, so b is fed straight into the triangular solves.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first call only) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First solve with the transposed upper factor: tempGPU <- (U^T)^{-1} b */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then solve with the transposed lower factor: x <- (L^T)^{-1} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1085 
/*
   Solves A x = b on the GPU with the factored matrix, applying the
   row/column permutations stored in cusparseTriFactors; this variant is
   only selected when the ordering is not the identity, so
   rpermIndices/cpermIndices are assumed non-NULL here -- TODO confirm
   against the factor-numeric dispatch.
   NOTE(review): unlike MatSolveTranspose_SeqAIJCUSPARSE, the end
   permutation iterators use bGPU/xGPU without +n (presumably equivalent,
   since the range is delimited by the index iterators), and the final
   permutation copies fall outside the GPU timing window -- verify intent.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, apply the row permutation: x <- b permuted by rpermIndices */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L: tempGPU <- L^{-1} x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U: x <- U^{-1} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1146 
/*
   Solves A x = b on the GPU with the factored matrix.  Natural-ordering
   variant: no permutations are applied, so b is fed straight into the
   lower triangular solve.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: tempGPU <- L^{-1} b */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U: x <- U^{-1} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1189 
1190 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1191 {
1192   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1193   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1194   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1195   PetscInt                     m = A->rmap->n,*ii,*ridx;
1196   PetscErrorCode               ierr;
1197   cusparseStatus_t             stat;
1198   cudaError_t                  err;
1199 
1200   PetscFunctionBegin;
1201   if (A->pinnedtocpu) PetscFunctionReturn(0);
1202   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
1203     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1204     if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1205       CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1206       /* copy values only */
1207       matrix->values->assign(a->a, a->a+a->nz);
1208       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1209     } else {
1210       MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1211       try {
1212         cusparsestruct->nonzerorow=0;
1213         for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1214 
1215         if (a->compressedrow.use) {
1216           m    = a->compressedrow.nrows;
1217           ii   = a->compressedrow.i;
1218           ridx = a->compressedrow.rindex;
1219         } else {
1220           /* Forcing compressed row on the GPU */
1221           int k=0;
1222           ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
1223           ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
1224           ii[0]=0;
1225           for (int j = 0; j<m; j++) {
1226             if ((a->i[j+1]-a->i[j])>0) {
1227               ii[k]  = a->i[j];
1228               ridx[k]= j;
1229               k++;
1230             }
1231           }
1232           ii[cusparsestruct->nonzerorow] = a->nz;
1233           m = cusparsestruct->nonzerorow;
1234         }
1235 
1236         /* allocate space for the triangular factor information */
1237         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1238         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1239         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1240         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
1241 
1242         err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
1243         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1244         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1245         err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1246         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1247         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1248         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
1249 
1250         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1251         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1252           /* set the matrix */
1253           CsrMatrix *matrix= new CsrMatrix;
1254           matrix->num_rows = m;
1255           matrix->num_cols = A->cmap->n;
1256           matrix->num_entries = a->nz;
1257           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1258           matrix->row_offsets->assign(ii, ii + m+1);
1259 
1260           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1261           matrix->column_indices->assign(a->j, a->j+a->nz);
1262 
1263           matrix->values = new THRUSTARRAY(a->nz);
1264           matrix->values->assign(a->a, a->a+a->nz);
1265 
1266           /* assign the pointer */
1267           matstruct->mat = matrix;
1268 
1269         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1270           CsrMatrix *matrix= new CsrMatrix;
1271           matrix->num_rows = m;
1272           matrix->num_cols = A->cmap->n;
1273           matrix->num_entries = a->nz;
1274           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1275           matrix->row_offsets->assign(ii, ii + m+1);
1276 
1277           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1278           matrix->column_indices->assign(a->j, a->j+a->nz);
1279 
1280           matrix->values = new THRUSTARRAY(a->nz);
1281           matrix->values->assign(a->a, a->a+a->nz);
1282 
1283           cusparseHybMat_t hybMat;
1284           stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1285           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1286             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1287           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1288               matstruct->descr, matrix->values->data().get(),
1289               matrix->row_offsets->data().get(),
1290               matrix->column_indices->data().get(),
1291               hybMat, 0, partition);CHKERRCUDA(stat);
1292           /* assign the pointer */
1293           matstruct->mat = hybMat;
1294 
1295           if (matrix) {
1296             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1297             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1298             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1299             delete (CsrMatrix*)matrix;
1300           }
1301         }
1302 
1303         /* assign the compressed row indices */
1304         matstruct->cprowIndices = new THRUSTINTARRAY(m);
1305         matstruct->cprowIndices->assign(ridx,ridx+m);
1306         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1307 
1308         /* assign the pointer */
1309         cusparsestruct->mat = matstruct;
1310 
1311         if (!a->compressedrow.use) {
1312           ierr = PetscFree(ii);CHKERRQ(ierr);
1313           ierr = PetscFree(ridx);CHKERRQ(ierr);
1314         }
1315         cusparsestruct->workVector = new THRUSTARRAY(m);
1316       } catch(char *ex) {
1317         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1318       }
1319       cusparsestruct->nonzerostate = A->nonzerostate;
1320     }
1321     ierr = WaitForGPU();CHKERRCUDA(ierr);
1322     A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
1323     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1324   }
1325   PetscFunctionReturn(0);
1326 }
1327 
/* Thrust functor applied to a (source, destination) tuple: accumulates the
   first tuple element into the second (dest += src).  Used below to scatter
   the compressed-row SpMV result into the full-length output vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1337 
/* Computes yy = A*xx on the GPU by delegating to MatMultAdd with no vector
   to add (yy argument of the add routine passed as NULL). */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1346 
/* Computes yy = A^T * xx on the GPU.  Rather than calling the cusparse
   transpose SpMV path, this multiplies xx by an explicitly stored transpose
   of A (matTranspose), built lazily on first use, using the non-transpose
   cusparse kernels. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* build the GPU copy of the transpose on first use */
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecSet(yy,0);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* y = alpha*A^T*x + beta_zero*y with the stored transpose; beta_zero means
       the spmv overwrites y */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
    /* ELL/HYB storage path */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* with no user-provided stream we synchronize before returning */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1394 
1395 
/* Computes zz = A*xx + yy on the GPU (yy may be NULL, in which case zz = A*xx;
   MatMult_SeqAIJCUSPARSE calls this routine that way).  When the GPU matrix is
   stored in compressed-row form (rows of all zeros removed), the SpMV result
   lands in the short workVector and is then scattered/accumulated into zz via
   cprowIndices; otherwise the SpMV writes into zz directly. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* cmpr is TRUE when the workVector length equals the full row count, i.e.
       the stored matrix covers every row and the result can be written
       straight into zz */
    cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
    /* dptr is where the spmv result goes: zz itself, or the temporary */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    /* accumulate in place (beta = 1) only when yy aliases zz and the spmv
       writes directly into zz; otherwise overwrite (beta = 0) */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      /* ELL/HYB path; skipped entirely when there are no stored rows */
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) {
      if (dptr != zarray) {
        /* result is in the temporary: seed zz with yy, scatter-add comes next */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        /* result already in zz: add yy unless it aliases zz (then beta_one
           already accounted for it) */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      /* plain MatMult with compressed storage: zero zz before the scatter-add */
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) {
      ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (!yy) { /* MatMult */
    if (!cusparsestruct->stream) {
      ierr = WaitForGPU();CHKERRCUDA(ierr);
    }
  }
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1483 
/* Computes zz = A^T*xx + yy on the GPU: zz is first set to yy, the product
   A^T*xx is formed into the temporary workVector using the explicitly stored
   transpose (built lazily), and the temporary is then scatter-added into zz. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* build the GPU copy of the transpose on first use */
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* zz starts as yy; the product is accumulated on top of it below */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
      /* ELL/HYB path; skipped when there are no stored rows */
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstructT->alpha, matstructT->descr, hybMat,
            xarray, matstructT->beta_zero,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
        VecCUDAPlusEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1548 
/* Finalizes assembly: completes the host-side (SeqAIJ) assembly, and for a
   final (non-flush) assembly of an unfactored matrix mirrors the data onto
   the GPU and (re)installs the CUSPARSE multiplication kernels. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    if (A->factortype == MAT_FACTOR_NONE) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    }
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1565 
1566 /* --------------------------------------------------------------------------------*/
1567 /*@
1568    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1570    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1571    assembly performance the user should preallocate the matrix storage by setting
1572    the parameter nz (or the array nnz).  By setting these parameters accurately,
1573    performance during matrix assembly can be increased by more than a factor of 50.
1574 
1575    Collective
1576 
1577    Input Parameters:
1578 +  comm - MPI communicator, set to PETSC_COMM_SELF
1579 .  m - number of rows
1580 .  n - number of columns
1581 .  nz - number of nonzeros per row (same for all rows)
1582 -  nnz - array containing the number of nonzeros in the various rows
1583          (possibly different for each row) or NULL
1584 
1585    Output Parameter:
1586 .  A - the matrix
1587 
1588    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1590    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1591 
1592    Notes:
1593    If nnz is given then nz is ignored
1594 
1595    The AIJ format (also called the Yale sparse matrix format or
1596    compressed row storage), is fully compatible with standard Fortran 77
1597    storage.  That is, the stored row and column indices can begin at
1598    either one (as in Fortran) or zero.  See the users' manual for details.
1599 
1600    Specify the preallocated storage with either nz or nnz (not both).
1601    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1602    allocation.  For large problems you MUST preallocate memory or you
1603    will get TERRIBLE performance, see the users' manual chapter on matrices.
1604 
1605    By default, this format uses inodes (identical nodes) when possible, to
1606    improve numerical efficiency of matrix-vector products and solves. We
1607    search for consecutive rows with the same nonzero structure, thereby
1608    reusing matrix information to achieve increased efficiency.
1609 
1610    Level: intermediate
1611 
1612 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1613 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate using either the uniform nz or the per-row nnz array */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1625 
/* Destroys the matrix: releases the GPU-side data (either the mult structures
   or the triangular-factor structures, depending on whether the matrix is a
   factor) and then runs the host-side SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrix: spptr holds the triangular factors */
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
    /* unfactored matrix with data actually allocated on the GPU */
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1641 
/* Duplicates A into *B: duplicates the host-side SeqAIJ data, then equips the
   copy with an empty CUSPARSE GPU data structure (mult structs or triangular
   factors, matching A's factor type) and the CUSPARSE operation table.  The
   GPU copy itself is created later, on first use. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle = 0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype == MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;

    C->spptr            = spptr;
    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle       = handle;
    spptr->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;

    C->spptr                     = tri;
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    tri->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1700 
/* In-place conversion of a SeqAIJ matrix to SeqAIJCUSPARSE: switches the
   default vector type to CUDA, attaches an empty CUSPARSE GPU data structure
   (mult structs or triangular factors, matching the factor type), and installs
   the CUSPARSE operation table.  GPU data is allocated lazily on first use. */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle = 0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype == MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;

    B->spptr            = spptr;
    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle       = handle;
    spptr->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;

    B->spptr                     = tri;
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1754 
/* Type constructor registered for MATSEQAIJCUSPARSE: creates a plain SeqAIJ
   matrix and then converts it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1764 
1765 /*MC
1766    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1767 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1769    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1770    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1771 
1772    Options Database Keys:
1773 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1774 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1775 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1776 
1777   Level: beginner
1778 
1779 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1780 M*/
1781 
1782 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1783 
1784 
/* Registers the CUSPARSE solver package for seqaijcusparse matrices: all four
   supported factorization types are served by the same factory routine. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType factors[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscInt            i;
  PetscErrorCode      ierr;

  PetscFunctionBegin;
  for (i = 0; i < (PetscInt)(sizeof(factors)/sizeof(factors[0])); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,factors[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1796 
1797 
/* Frees the GPU-side data of an unfactored SeqAIJCUSPARSE matrix: both mult
   structures (matrix and cached transpose), the scratch work vector, and the
   cusparse handle; then deletes the container and nulls the caller's pointer.
   Safe to call when *cusparsestruct is already NULL. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    /* check the destroy routines' return codes (previously silently ignored) */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    handle = (*cusparsestruct)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1816 
/* Frees a CsrMatrix (device arrays for values, column indices, and row
   offsets) and nulls the caller's pointer; a NULL *mat is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
1829 
/* Overload for a triangular-factor structure: releases the cusparse matrix
   descriptor, the solve-analysis info, and the CSR storage, then deletes the
   container and nulls the caller's pointer; a NULL *trifactor is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (tf) {
    if (tf->descr)     { stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUDA(stat); }
    if (tf->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(tf->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    delete tf;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1845 
/* Overload for a multiplication structure: releases the stored matrix (HYB or
   CSR depending on the format), the cusparse descriptor, the compressed-row
   index array, and the device-resident alpha/beta scalars, then deletes the
   container and nulls the caller's pointer; a NULL *matstruct is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* check the return code (previously silently ignored) */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1873 
/* Frees the GPU-side data of a factored matrix: all four triangular-factor
   structures, the permutation index arrays, the scratch work vector, and the
   cusparse handle; then deletes the container and nulls the caller's pointer.
   Safe to call when *trifactors is already NULL. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* check the destroy routines' return codes (previously silently ignored) */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1896 
1897