xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 301298b4754cb4d1332dd3ce13d2561d0dabdd3e)
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
20 
21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
24 
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 
35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
36 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
37 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
38 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
39 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
/* MatCUSPARSESetStream - records the given CUDA stream on the matrix GPU data
   and attaches it to the matrix cusparse handle, so subsequent cusparse calls
   made through that handle are issued on this stream. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *gpustruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cerr;

  PetscFunctionBegin;
  gpustruct->stream = stream;
  cerr = cusparseSetStream(gpustruct->handle,gpustruct->stream);CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}
51 
/* MatCUSPARSESetHandle - installs an externally created cusparse handle on the
   matrix.  Any different handle currently held is destroyed first; the handle is
   then switched to device pointer mode. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *gpustruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cerr;

  PetscFunctionBegin;
  if (gpustruct->handle != handle) {
    /* release the handle we currently hold before adopting the new one */
    if (gpustruct->handle) {
      cerr = cusparseDestroy(gpustruct->handle);CHKERRCUDA(cerr);
    }
    gpustruct->handle = handle;
  }
  /* scalar arguments (alpha/beta) passed to cusparse calls will reside in device memory */
  cerr = cusparseSetPointerMode(gpustruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}
67 
/* MatCUSPARSEClearHandle - forgets (without destroying) the cusparse handle held
   by the matrix GPU data.  The handle is assumed to be owned elsewhere (it was
   provided via MatCUSPARSESetHandle()), so it is only zeroed here.

   Robustness fix: guard against A->spptr being NULL -- a clear-type routine can
   be reached on a matrix whose GPU data was never allocated or was already
   destroyed, in which case there is nothing to do. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
76 
/* Query callback composed on CUSPARSE factor matrices (see
   MatGetFactor_seqaijcusparse_cusparse() below): reports the solver package
   name returned by MatFactorGetSolverType(). */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
83 
84 /*MC
85   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
86   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
88   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
89   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
90   algorithms are not recommended. This class does NOT support direct solver operations.
91 
92   Level: beginner
93 
94 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
95 M*/
96 
/*
   Factor-creation routine registered for the (MATSEQAIJCUSPARSE, MATSOLVERCUSPARSE)
   pair: creates an empty square factor matrix B the size of A and installs the
   CUSPARSE symbolic-factorization entry points for the requested factor type
   (LU/ILU/ILUDT or Cholesky/ICC).  Unsupported factor types raise PETSC_ERR_SUP.

   Note the factor type is assigned before MatSetType() -- presumably so the
   MATSEQAIJCUSPARSE constructor can take it into account; confirm before
   reordering.  Preallocation is skipped (MAT_SKIP_ALLOCATION) because the
   symbolic factorization fills in the factor's structure later.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
121 
/* Type-specific implementation behind MatCUSPARSESetFormat(): records the
   requested GPU storage format on the matrix.  For sequential matrices
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both map to the single stored
   format, so the two cases share one assignment. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: same storage slot for both operations */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
139 
140 /*@
141    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
142    operation. Only the MatMult operation can use different GPU storage formats
143    for MPIAIJCUSPARSE matrices.
144    Not Collective
145 
146    Input Parameters:
147 +  A - Matrix of type SEQAIJCUSPARSE
148 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
149 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
150 
151    Output Parameter:
152 
153    Level: intermediate
154 
155 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
156 @*/
/* Public entry point: dispatches to the type-specific implementation (if the
   matrix type registered one) via the composed "MatCUSPARSESetFormat_C" method;
   a no-op for matrix types that did not compose it. */
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  perr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(perr);
  PetscFunctionReturn(0);
}
166 
/*
   Options processing: -mat_cusparse_mult_storage_format sets the SpMV storage
   format (offered only for unfactored matrices), while -mat_cusparse_storage_format
   sets the format used for both SpMV and triangular solves.

   NOTE(review): cusparsestruct is obtained by casting A->spptr to
   Mat_SeqAIJCUSPARSE, and cusparsestruct->format is read as the default for the
   second option even when A->factortype != MAT_FACTOR_NONE; elsewhere in this
   file factored matrices store a Mat_SeqAIJCUSPARSETriFactors in A->spptr --
   confirm the cast is valid on that path.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
192 
/* Symbolic ILU: defer to the CPU SeqAIJ implementation, then point the numeric
   stage at the CUSPARSE version.  The override presumably must follow the call,
   since the SeqAIJ symbolic routine installs its own numeric kernel. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode status;

  PetscFunctionBegin;
  status = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(status);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
202 
/* Symbolic LU: defer to the CPU SeqAIJ implementation, then point the numeric
   stage at the CUSPARSE version (override after the call so it is not clobbered
   by whatever the SeqAIJ routine installs). */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode status;

  PetscFunctionBegin;
  status = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(status);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
212 
/* Symbolic ICC: defer to the CPU SeqAIJ implementation, then point the numeric
   stage at the CUSPARSE Cholesky version. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode status;

  PetscFunctionBegin;
  status = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(status);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
222 
/* Symbolic Cholesky: defer to the CPU SeqAIJ implementation, then point the
   numeric stage at the CUSPARSE version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode status;

  PetscFunctionBegin;
  status = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(status);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
232 
/*
   Extracts the lower-triangular factor L from the host ILU/LU factor stored in A
   (a factored SEQAIJ matrix, whose rows interleave the strictly lower part of L
   and the row of U) into a fresh CUSPARSE CSR matrix with an explicit unit
   diagonal appended to each row, then runs the cusparse triangular-solve
   analysis so subsequent MatSolve calls can reuse it.

   Only acts when the GPU copy is absent or stale (PETSC_OFFLOAD_UNALLOCATED or
   PETSC_OFFLOAD_CPU); otherwise a no-op.

   NOTE(review): nzLower reads ai[1], so this presumably assumes n >= 1 --
   confirm callers never reach here with an empty matrix.
   NOTE(review): the pinned buffers AiLo/AjLo/AALo are freed only on the success
   path; a throw before cudaFreeHost would leak them.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate pinned host staging space for the lower triangular matrix (fast async upload) */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix; row 0 holds only the unit diagonal entry */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, lower fill, unit diagonal
         (the 1.0 diagonal values stored above are therefore informational only) */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: the thrust assigns below copy the staged CSR arrays to the device */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* store the new factor on the matrix so later solves can find it */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
335 
/*
   Extracts the upper-triangular factor U from the host ILU/LU factor stored in A
   into a fresh CUSPARSE CSR matrix and runs the cusparse triangular-solve
   analysis on it.  The SeqAIJ factor keeps U's rows reachable via a->diag
   (adiag[i] locates row i's diagonal; rows are walked backwards), and stores the
   inverse of the diagonal, hence the 1./v[nz] below to recover U's diagonal value.

   Only acts when the GPU copy is absent or stale; otherwise a no-op.

   NOTE(review): the pinned buffers AiUp/AjUp/AAUp are freed only on the success
   path; a throw before cudaFreeHost would leak them.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate pinned host staging space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first and
         filling the staging arrays from the back */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements (the factor stores the inverted diagonal) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: the thrust assigns copy the staged CSR arrays to the device */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* store the new factor on the matrix so later solves can find it */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
434 
/*
   Uploads both triangular ILU/LU factors of A to the GPU, allocates the work
   vector used by the two-stage triangular solve, and caches the row/column
   permutation indices on the device when the orderings are not the identity
   (MatSolve uses them to permute the right-hand side and solution).

   NOTE(review): workVector (and the perm index arrays) are allocated with new
   on every call without freeing any previous allocation; if this routine can
   run more than once per factor (repeated numeric factorizations), that would
   leak -- confirm the lifetime against MatLUFactorNumeric_SeqAIJCUSPARSE.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log one n-sized host-to-device transfer per permutation actually uploaded */
  if(!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
481 
/*
   Builds both CUSPARSE triangular factors for an ICC/Cholesky factorization
   from the host factor stored in A.  Only the upper factor's CSR structure is
   staged (AiUp/AjUp); two value arrays are filled: AAUp for the upper solve and
   AALo (rows additionally scaled by the inverted diagonal) for the lower solve.
   The "lower" factor is then represented by the SAME upper CSR pattern solved
   with CUSPARSE_OPERATION_TRANSPOSE, which is why its fill mode is set to
   CUSPARSE_FILL_MODE_UPPER below.

   Only acts when the GPU copy is absent or stale; otherwise a no-op.

   NOTE(review): A->data is cast both to Mat_SeqAIJ (for a->nz) and to
   Mat_SeqSBAIJ (for the i/j/a arrays); presumably the ICC/Cholesky factor of a
   SeqAIJ matrix is stored in SBAIJ-compatible layout -- confirm.
   NOTE(review): the pinned buffers are freed only on the success path; a throw
   would leak them.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate pinned host staging space for the upper triangular matrix
         (shared structure; two value arrays, one per factor) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements (the factor stores the inverted diagonal at v[nz]) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* negate the off-diagonals; the lower-factor copy is additionally
             scaled by the inverted diagonal of the row */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* store the new factor on the matrix so later solves can find it */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: same upper-stored pattern, but solved
         transposed (below) to realize the lower triangular solve */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation: transpose of the stored upper factor */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* store the new factor on the matrix so later solves can find it */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
627 
/* Uploads the ICC/Cholesky triangular factors of A to the GPU, sizes the work
   vector used by the triangular solves, and -- when the ordering is not the
   natural one -- caches the permutation on the device (the same index set is
   used for rows and columns, the factorization being symmetric). */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij        = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm = aij->row;
  const PetscInt               *pidx;
  PetscBool                    isnatural;
  PetscInt                     nrows = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  triFactors->workVector = new THRUSTARRAY(nrows);
  /* total nonzeros over both factors: off-diagonal entries count twice, the diagonal once */
  triFactors->nnz = (aij->nz-nrows)*2 + nrows;

  ierr = ISGetIndices(perm,&pidx);CHKERRQ(ierr);
  ierr = ISIdentity(perm,&isnatural);CHKERRQ(ierr);
  if (!isnatural) {
    triFactors->rpermIndices = new THRUSTINTARRAY(nrows);
    triFactors->rpermIndices->assign(pidx, pidx+nrows);
    triFactors->cpermIndices = new THRUSTINTARRAY(nrows);
    triFactors->cpermIndices->assign(pidx, pidx+nrows);
    ierr = PetscLogCpuToGpu(2*nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(perm,&pidx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
656 
/* Numeric LU: the factorization itself runs on the host via the SeqAIJ kernel;
   afterwards the solve kernels are selected (natural-ordering variants avoid
   permuting the vectors) and the factors are analyzed/copied to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      rowid,colid;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(fact->row,&rowid);CHKERRQ(ierr);
  ierr = ISIdentity(fact->col,&colid);CHKERRQ(ierr);
  if (rowid && colid) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
681 
/* Numeric Cholesky (ICC) factorization: factor on the CPU with the SeqAIJ
   kernel, then pick the GPU solve flavour and upload the factors. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bdata = (Mat_SeqAIJ*)B->data;
  IS             perm = bdata->row;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* CPU numeric factorization */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* the natural-ordering solve kernels skip the permutation step entirely */
  ierr = ISIdentity(perm,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* move the triangular factors to the GPU and run the cusparse analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
706 
/* Build the transposed (CSC) representations of the lower and upper triangular
   factors and run the cusparse solve analysis on them, so that transpose solves
   can be performed with CUSPARSE_OPERATION_NON_TRANSPOSE kernels.

   The transposes are stored in cusparseTriFactors->{lo,up}TriFactorPtrTranspose.
   Called lazily by the transpose-solve routines on first use. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;  /* previously dead-initialized from spptr and immediately overwritten */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor of the lower triangular factor, flipping the fill mode:
     the transpose of a lower triangular matrix is upper triangular (and vice versa) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* the transpose is stored explicitly, so the solve uses the non-transpose op */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* publish the transpose so later solves find it (the locals above are only scratch) */
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor of the upper triangular factor, flipping the fill mode */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* the transpose is stored explicitly, so the solve uses the non-transpose op */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* publish the transpose so later solves find it */
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
837 
/* Build an explicit transpose of the matrix on the GPU (in the configured
   storage format) so that MatMultTranspose can use a non-transpose SpMV.
   The result is stored in cusparsestruct->matTranspose.

   Note: the transpose has A->cmap->n rows and A->rmap->n columns; all buffer
   sizes and conversion dimensions below are keyed off the transpose's own
   dimensions (the previous code sized matrixT->row_offsets and the HYB
   temporaries with A->rmap->n, which undersizes them for rectangular A with
   more columns than rows; behavior for square A is unchanged). */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;  /* previously dead-initialized from spptr and immediately overwritten */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the transpose mult structure */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* device-resident alpha/beta scalars (cusparse is put in device pointer mode below) */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;   /* rows of A^T */
    matrixT->num_cols = A->rmap->n;   /* cols of A^T */
    matrixT->num_entries = a->nz;
    /* csr2csc writes num_cols(A)+1 = num_rows(A^T)+1 column pointers */
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose, i.e. the CSC of A */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->cmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose); dimensions are those of A^T */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->cmap->n;
    tempT->num_cols = A->rmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB; the leading dimensions are those of the transpose */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices: one per row of A^T */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* publish the transpose */
  cusparsestruct->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
969 
/* Solve A^T x = b on the GPU using explicitly transposed triangular factors.
   This variant is installed only when the factorization ordering is not the
   identity (see MatLUFactorNumeric_SeqAIJCUSPARSE), so rpermIndices and
   cpermIndices are assumed to exist on the device. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Lazily build the transposed factors (CSC form + cusparse solve analysis)
     the first time a transpose solve is requested */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, apply the row permutation: x <- b(rperm) (gather via permutation iterator) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Solve with U^T first: for A = L*U we have A^T = U^T*L^T, so the solve
     order is reversed relative to the forward solve */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve with L^T */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Apply the column permutation into the work vector; the gather cannot be
     done in place because x is both source and (permuted) destination */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  /* two triangular solves: ~2*nnz flops minus one per diagonal */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1038 
/* Solve A^T x = b on the GPU with natural ordering: no permutations are
   applied, only the two transposed triangular solves (U^T then L^T). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* lazily build and analyze the transposed factors on first use */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* device pointers for the right-hand side and the solution */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* solve with U^T: b -> work */
  stat = cusparse_solve(fs->handle, upT->solveOp,
                        upT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        barray, work->data().get());CHKERRCUDA(stat);

  /* solve with L^T: work -> x */
  stat = cusparse_solve(fs->handle, loT->solveOp,
                        loT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1089 
/* Solve A x = b on the GPU with the triangular factors, applying the row
   permutation before and the column permutation after the two solves.
   This variant is installed only when the ordering is not the identity, so
   rpermIndices/cpermIndices are assumed to exist on the device. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, apply the row permutation: x <- b(rperm) (gather via permutation iterator).
     NOTE(review): the end iterator uses base bGPU while the transpose variant uses
     bGPU+n; the end iterator's base is never dereferenced, only its index iterator
     is compared, so the two forms behave the same here. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* Apply the column permutation into the work vector; the gather cannot be
     done in place because x is both source and (permuted) destination */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  /* two triangular solves: ~2*nnz flops minus one per diagonal */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1150 
/* Solve A x = b on the GPU with natural ordering: no permutations are
   applied, only the forward (L) and backward (U) triangular solves. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* device pointers for the right-hand side and the solution */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* forward solve with L: b -> work */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        barray, work->data().get());CHKERRCUDA(stat);

  /* backward solve with U: work -> x */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows, &PETSC_CUSPARSE_ONE, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1193 
/*
   MatSeqAIJCUSPARSECopyToGPU - Copies the host (Mat_SeqAIJ) data of A to the
   GPU, (re)building the cusparse data structures stored in A->spptr.

   Fast path: if the matrix is assembled, the nonzero pattern is unchanged
   since the last upload, and the storage format is plain CSR, only the
   numerical values are re-uploaded.  Otherwise the whole
   Mat_SeqAIJCUSPARSEMultStruct is destroyed and rebuilt.

   On successful exit A->valid_GPU_matrix is PETSC_OFFLOAD_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->pinnedtocpu) PetscFunctionReturn(0); /* user requested the matrix stay on the CPU */
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* same nonzero pattern as the last upload: copy values only */
      matrix->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* pattern or format changed: throw away the old GPU structure and rebuild */
      MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
      try {
        /* count the rows that carry at least one nonzero */
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          /* the host matrix already stores compressed-row offsets and indices */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU: build row offsets (ii) and the
             original row indices (ridx) of the rows that have nonzeros */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate the multiply structure and its cusparse matrix descriptor */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* device-resident scalar constants used as alpha/beta in the spmv
           calls; cusparse is switched to device pointer mode below */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* a temporary device CSR matrix is built first, then converted to HYB */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          /* ELL = fixed-width partition; HYB lets cusparse pick the split */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging arrays are no longer needed once converted */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        if (!a->compressedrow.use) {
          /* ii/ridx were allocated above only in this branch */
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        /* scratch vector of compressed length used by the multiply kernels */
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1332 
/* Thrust functor applied to a (source, destination) element tuple:
   accumulates the source entry into the destination entry.  Used to
   scatter-add a compressed work vector back into a full-length vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1342 
/* y = A*x on the GPU; delegated to the fused multiply-add routine with a
   NULL addend vector (MatMultAdd handles yy == NULL as a plain multiply). */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1351 
/*
   yy = A^T * xx on the GPU.

   An explicit transpose of the matrix is built lazily on first use and
   cached in cusparsestruct->matTranspose, so that cusparse can always be
   called with CUSPARSE_OPERATION_NON_TRANSPOSE.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* build and cache the explicit transpose on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  /* NOTE(review): yy is zeroed here even though the spmv below uses beta = 0;
     presumably this covers rows the spmv does not touch -- confirm against
     how the transpose structure is generated */
  ierr = VecSet(yy,0);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
    /* ELL and HYB formats both use the cusparse hybrid matrix path */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  /* 2*nnz flops minus one per nonzero row (first product needs no add) */
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1399 
1400 
/*
   zz = yy + A*xx (or zz = A*xx when yy is NULL; MatMult is routed through
   here with yy == NULL).

   When rows have been compressed away, the spmv writes into the shorter
   workVector and its entries are scattered back into zz (z[row] += work[k])
   through the cached compressed-row index list cprowIndices.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* if no rows were compressed away (workVector is full length) the spmv
       can write straight into z; otherwise it targets the temporary */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n) ? zarray : cusparsestruct->workVector->data().get();
    /* beta = 1 only when accumulating in place into z and z aliases y */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      /* hybrid path; skipped entirely when there are no nonzero rows */
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) {
      if (dptr != zarray) {
        /* product went to the temporary: start from z = y, scatter-add below */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        /* product is already in z with beta = 0: add y separately */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      /* plain multiply with compressed rows: zero z before the scatter-add */
      ierr = VecSet(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (!yy) { /* MatMult */
    if (!cusparsestruct->stream) {
      ierr = WaitForGPU();CHKERRCUDA(ierr);
    }
  }
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1478 
/*
   zz = yy + A^T * xx using the lazily-built, cached explicit transpose.

   The spmv result lands in the shared workVector and is scatter-added into
   zz through matstructT->cprowIndices.

   NOTE(review): the scatter below advances the zip iterator by A->cmap->n
   (the transpose's row count) while workVector was allocated with the
   compressed row count of A -- confirm that
   MatSeqAIJCUSPARSEGenerateTransposeForMult guarantees these agree,
   otherwise this walks past the end of workVector.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* build and cache the explicit transpose on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* start from z = y; the transpose product is accumulated on top */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstructT->alpha, matstructT->descr, hybMat,
            xarray, matstructT->beta_zero,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
        VecCUDAPlusEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1543 
/* Completes assembly: finishes the host-side SeqAIJ assembly, pushes the
   data to the GPU for non-factored matrices, and (on final assembly only)
   installs the CUSPARSE multiply kernels. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* only a final assembly switches the operation table to the GPU kernels */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1560 
1561 /* --------------------------------------------------------------------------------*/
1562 /*@
1563    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1565    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1566    assembly performance the user should preallocate the matrix storage by setting
1567    the parameter nz (or the array nnz).  By setting these parameters accurately,
1568    performance during matrix assembly can be increased by more than a factor of 50.
1569 
1570    Collective
1571 
1572    Input Parameters:
1573 +  comm - MPI communicator, set to PETSC_COMM_SELF
1574 .  m - number of rows
1575 .  n - number of columns
1576 .  nz - number of nonzeros per row (same for all rows)
1577 -  nnz - array containing the number of nonzeros in the various rows
1578          (possibly different for each row) or NULL
1579 
1580    Output Parameter:
1581 .  A - the matrix
1582 
1583    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1585    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1586 
1587    Notes:
1588    If nnz is given then nz is ignored
1589 
1590    The AIJ format (also called the Yale sparse matrix format or
1591    compressed row storage), is fully compatible with standard Fortran 77
1592    storage.  That is, the stored row and column indices can begin at
1593    either one (as in Fortran) or zero.  See the users' manual for details.
1594 
1595    Specify the preallocated storage with either nz or nnz (not both).
1596    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1597    allocation.  For large problems you MUST preallocate memory or you
1598    will get TERRIBLE performance, see the users' manual chapter on matrices.
1599 
1600    By default, this format uses inodes (identical nodes) when possible, to
1601    improve numerical efficiency of matrix-vector products and solves. We
1602    search for consecutive rows with the same nonzero structure, thereby
1603    reusing matrix information to achieve increased efficiency.
1604 
1605    Level: intermediate
1606 
1607 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1608 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global dimensions coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* the internal SeqAIJ preallocation routine takes a non-const array, so
     const is cast away here -- presumably nnz is not modified; confirm */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1620 
/* Destroys the matrix: releases the GPU-side data hanging off A->spptr
   (either the multiply structure or the triangular factors, depending on
   whether this is a factored matrix), then destroys the host SeqAIJ data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
    /* nothing was ever pushed to the GPU if the matrix is still unallocated there */
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1636 
/*
   Duplicates A as a new SEQAIJCUSPARSE matrix: the host data is duplicated
   by MatDuplicate_SeqAIJ, then a fresh (empty) GPU-side structure is
   attached to the copy.  The GPU data itself is NOT copied; it is rebuilt
   lazily on the next MatSeqAIJCUSPARSECopyToGPU (the duplicate is marked
   PETSC_OFFLOAD_UNALLOCATED).
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;
  Mat C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    C->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
  }

  /* install the CUSPARSE implementations of the matrix operations */
  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* nothing lives on the GPU yet; the next CopyToGPU builds everything */
  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1695 
/*
   Converts a SeqAIJ matrix B to the SEQAIJCUSPARSE type in place: switches
   the default vector type to VECCUDA, attaches an empty GPU-side structure
   to B->spptr (multiply structure or triangular-factor structure depending
   on B->factortype), and installs the CUSPARSE matrix operations.  The data
   itself is pushed to the GPU lazily at assembly/multiply time.
*/
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
  }

  /* install the CUSPARSE implementations of the matrix operations */
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* nothing lives on the GPU yet; the first CopyToGPU builds everything */
  B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1749 
/* Type constructor registered for MATSEQAIJCUSPARSE: creates a plain SeqAIJ
   matrix and converts it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  /* bug fix: the error code of the conversion was previously ignored */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1759 
1760 /*MC
1761    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1762 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1764    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1765    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1766 
1767    Options Database Keys:
1768 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1769 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1770 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1771 
1772   Level: beginner
1773 
1774 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1775 M*/
1776 
1777 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1778 
1779 
/* Registers the cusparse solver package with PETSc: the same factory routine
   provides LU, Cholesky, ILU, and ICC factorizations for SEQAIJCUSPARSE. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  PetscInt            i;
  const MatFactorType factors[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};

  PetscFunctionBegin;
  for (i=0; i<(PetscInt)(sizeof(factors)/sizeof(factors[0])); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,factors[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1791 
1792 
/* Frees all GPU-side resources of a non-factored SEQAIJCUSPARSE matrix:
   the multiply structures (matrix and cached transpose), the scratch work
   vector, the cusparse handle, and the container itself. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    /* bug fix: the error codes of the destroy routines were previously ignored */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    handle = (*cusparsestruct)->handle;
    if (handle) { stat = cusparseDestroy(handle);CHKERRCUDA(stat); }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1811 
/* Frees a device CSR matrix (values, column indices, row offsets, and the
   container) and nulls the caller's pointer.  A NULL *mat is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *csr = *mat;
    *mat = 0;
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
  }
  PetscFunctionReturn(0);
}
1824 
/* Frees one triangular-factor structure: the cusparse matrix descriptor,
   the solve-analysis info, the device CSR matrix, and the container itself.
   A NULL *trifactor is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (!*trifactor) PetscFunctionReturn(0);
  if ((*trifactor)->descr)     { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
  if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
  ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
  delete *trifactor;
  *trifactor = 0;
  PetscFunctionReturn(0);
}
1840 
/* Frees a multiply structure: the stored matrix (HYB handle or device CSR,
   depending on the storage format), the matrix descriptor, the compressed
   row index list, and the three device-resident alpha/beta scalars. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* bug fix: the error code of CsrMatrix_Destroy() was previously ignored */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1868 
/* Frees all GPU-side resources of a factored SEQAIJCUSPARSE matrix: the four
   triangular-factor structures (lower/upper and their transposes), the
   row/column permutation index lists, the scratch work vector, the cusparse
   handle, and the container itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* bug fix: the error codes of the destroy routines were previously ignored */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    handle = (*trifactors)->handle;
    if (handle) { stat = cusparseDestroy(handle);CHKERRCUDA(stat); }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1891 
1892