xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 9895aa37ac365bac650f6bd8bf977519f7222510)
1 /*
2     Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format.
4 */
5 
6 #include "petscconf.h"
7 PETSC_CUDA_EXTERN_C_BEGIN
8 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
9 //#include "petscbt.h"
10 #include "../src/vec/vec/impls/dvecimpl.h"
11 #include "petsc-private/vecimpl.h"
12 PETSC_CUDA_EXTERN_C_END
13 #undef VecType
14 #include "cusparsematimpl.h"
15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 /* this is such a hack ... but I don't know of another way to pass this variable
18    from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
19    SpMV. Essentially, I need to use the same stream variable in two different
20    data structures. I do this by creating a single instance of that stream
21    and reuse it. */
22 cudaStream_t theBodyStream=0;
23 
24 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
26 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
27 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
31 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
32 PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat);
33 PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
34 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
35 PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
36 PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
37 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Query callback composed onto CUSPARSE factor matrices so that
   MatFactorGetSolverPackage() reports MATSOLVERCUSPARSE ("cusparse"). */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
46 
47 EXTERN_C_BEGIN
48 extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
49 EXTERN_C_END
50 /*
  MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers (ILU) on the GPU
  for sequential matrices of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp
53 
54    Level: beginner
55 */
56 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/* Factory routine registered for MATSOLVERCUSPARSE: creates the factor
   matrix B for LU/ILU/ILUDT factorizations of a seqaij matrix A.

   The factor is first created by the standard PETSc seqaij factory and then
   retyped to MATSEQAIJCUSPARSE so the triangular solves run on the GPU.
   Cholesky/ICC factor types are rejected with PETSC_ERR_SUP. */
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* build the host-side factor matrix first */
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
    /* expose the solver-package query on the factor object */
    ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);

    /* install the symbolic-factorization entry points; these in turn install
       the CUSPARSE numeric factorization */
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
EXTERN_C_END
78 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/* Type-specific implementation behind MatCUSPARSESetFormat(): records the
   requested GPU storage format on the matrix (used by SpMV) and/or in the
   global triangular-solve format setting, depending on the operation. */
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *gpustruct   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          affectsMult  = (PetscBool)(op == MAT_CUSPARSE_MULT  || op == MAT_CUSPARSE_ALL);
  PetscBool          affectsSolve = (PetscBool)(op == MAT_CUSPARSE_SOLVE || op == MAT_CUSPARSE_ALL);

  PetscFunctionBegin;
  if (!affectsMult && !affectsSolve) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  if (affectsMult)  gpustruct->format             = format;
  if (affectsSolve) cusparseMatSolveStorageFormat = format;
  PetscFunctionReturn(0);
}
EXTERN_C_END
104 
105 
106 /*@
107    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
108    operation. Only the MatMult operation can use different GPU storage formats
109    for MPIAIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
110    to build/install PETSc to use this package.
111 
112    Not Collective
113 
114    Input Parameters:
115 +  A - Matrix of type SEQAIJCUSPARSE
116 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
117 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)
118 
119    Output Parameter:
120 
121    Level: intermediate
122 
123 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
124 @*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation; a no-op for matrix types
     that have not composed "MatCUSPARSESetFormat_C" */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
136 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/* Processes the -mat_cusparse_* options controlling the GPU storage format:
   for unfactored matrices the SpMV format, for factored matrices the
   triangular-solve format, and -mat_cusparse_storage_format for both.

   Fixes relative to the original: PetscOptionsHead() must be called inside
   the PetscOptionsBegin()/PetscOptionsEnd() pair (it was called before
   Begin), and the error code returned by PetscObjectOptionsBegin() was
   assigned to ierr but never checked. */
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* unfactored matrix: the format only affects the multiply kernels */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    /* factored matrix: the format only affects the triangular solves */
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  /* umbrella option: sets the format for both SpMV and TriSolve */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
170 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ILU factorization: defers entirely to the CPU seqaij
   implementation, then installs the CUSPARSE numeric-factorization routine
   so the subsequent numeric phase also builds the GPU-side factors. */
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);

  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
183 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic LU factorization: defers entirely to the CPU seqaij
   implementation, then installs the CUSPARSE numeric-factorization routine
   so the subsequent numeric phase also builds the GPU-side factors. */
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);

  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
196 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildLowerTriMatrix"
/* Builds the GPU representation of the unit lower-triangular factor L from
   the host factored matrix and runs the cusparse solve analysis on it.

   The host arrays a->i/a->j/a->a appear to hold the strictly lower part of
   the factor row by row (TODO confirm against MatLUFactorNumeric_SeqAIJ
   storage), so a 1.0 diagonal entry is appended to every row while the CSR
   arrays AiLo/AjLo/AALo are assembled in pinned host memory.  Work is only
   done when the GPU copy is absent or stale. */
PetscErrorCode MatSeqAIJCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               * cusparseMat       = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *ai = a->i,*aj = a->j,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscErrorCode               ierr;
  PetscInt                     *AiLo, *AjLo;
  PetscScalar                  *AALo;
  PetscInt                     i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* ai[n]-ai[1] counts the off-diagonal entries of rows 1..n-1 (row 0 of
         L holds only its unit diagonal); +n adds the diagonal itself */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host buffers for the CSR arrays */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      /* row 0 contains only the unit diagonal entry */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the off-diagonal entries of row i ... */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        /* ... then append the unit diagonal entry at the end of the row */
        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }
      /* instantiate the GPU matrix in the configured storage format */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      /* publish the new factor on the matrix */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;

      /* the pinned staging buffers are no longer needed */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
267 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildUpperTriMatrix"
/* Builds the GPU representation of the upper-triangular factor U from the
   host factored matrix and runs the cusparse solve analysis on it.

   a->diag appears to index the U rows in reverse (adiag[i] > adiag[i+1],
   with the diagonal value stored inverted at v[nz] — TODO confirm against
   MatLUFactorNumeric_SeqAIJ storage); the rows are therefore walked from
   n-1 down to 0 while the CSR arrays AiUp/AjUp/AAUp are assembled in
   pinned host memory.  Work is only done when the GPU copy is absent or
   stale. */
PetscErrorCode MatSeqAIJCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               * cusparseMat       = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscInt                     *AiUp, *AjUp;
  PetscScalar                  *AAUp;
  PetscInt                     i,nz, nzUpper, offset;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned (page-locked) host buffers for the CSR arrays */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      /* walk rows from last to first, filling the CSR arrays back-to-front */
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the off-diagonal entries of the row */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* instantiate the GPU matrix in the configured storage format */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      /* publish the new factor on the matrix */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;

      /* the pinned staging buffers are no longer needed */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
334 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalysisAndCopyToGPU"
/* Copies both triangular factors of A to the GPU, allocates the temporary
   vector used between the lower and upper solves, and — when the row or
   column ordering is not the identity — uploads the permutation indices so
   the solves can reorder the vectors on the GPU. */
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow               = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* build L and U on the GPU (these update A->spptr's factor pointers) */
  ierr = MatSeqAIJCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector holding the intermediate result between the two solves */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
372 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU factorization: the factorization itself is done by the CPU
   seqaij routine; afterwards the appropriate solve kernels are selected
   (the cheaper NaturalOrdering variants when both row and column orderings
   are the identity) and the triangular factors are copied to the GPU. */
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *b    =(Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscBool      natural;

  PetscFunctionBegin;
  /* factor on the CPU */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  natural = (PetscBool)(row_identity && col_identity);

  B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering          : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
399 
400 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/* Runs the cusparse solve analysis needed for transpose solves on both
   triangular factors.  Note the fill modes are swapped relative to the
   non-transpose case: the transpose of the lower factor is upper
   triangular, and the transpose of the upper factor is lower triangular. */
PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc* cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  PetscFunctionBegin;
  stat = cusparseMatLo->initializeCusparseMat(MAT_cusparseHandle,
                                              CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                              CUSPARSE_FILL_MODE_UPPER,
                                              TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatLo->solveAnalysis(TRANSPOSE);CHKERRCUSP(stat);

  stat = cusparseMatUp->initializeCusparseMat(MAT_cusparseHandle,
                                              CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                              CUSPARSE_FILL_MODE_LOWER,
                                              TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatUp->solveAnalysis(TRANSPOSE);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
423 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/* Builds the transposed copy of the matrix on the GPU so that
   MatMultTranspose does not have to transpose on the fly. */
PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t   stat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* the fill mode is irrelevant for a GENERAL-type matrix */
  stat = gpuMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, TRANSPOSE);CHKERRCUSP(stat);
  ierr = gpuMat->mat->setMatrix(A->rmap->n, A->cmap->n, aij->nz, aij->i, aij->j, aij->a, TRANSPOSE);CHKERRCUSP(ierr);
  PetscFunctionReturn(0);
}
437 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/* Solves A^T x = b on the GPU using the transposed triangular factors,
   applying the factorization's row/column reordering on the device.  The
   transpose solve-analysis data is created lazily on first use.  The solve
   order is U^T then L^T — the reverse of the non-transpose solve. */
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  CUSPARRAY      *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Analyze the matrix ... on the fly */
  if (!cusparseTriFactors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    cusparseTriFactors->hasTranspose=PETSC_TRUE;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering */
  ierr = cusparseMatUp->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatUp->solve(xGPU, tempGPU, TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatLo->solve(tempGPU, xGPU, TRANSPOSE);CHKERRCUSP(stat);
  ierr = cusparseMatLo->reorderOut(xGPU);CHKERRCUSP(ierr);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
475 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* Solves A^T x = b on the GPU for factors in natural ordering: identical to
   MatSolveTranspose_SeqAIJCUSPARSE but with no reorderIn/reorderOut steps,
   since both orderings are the identity. */
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode    ierr;
  CUSPARRAY         *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Analyze the matrix ... on the fly */
  if (!cusparseTriFactors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    cusparseTriFactors->hasTranspose=PETSC_TRUE;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve: U^T first, then L^T */
  stat = cusparseMatUp->solve(bGPU, tempGPU, TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatLo->solve(tempGPU, xGPU, TRANSPOSE);CHKERRCUSP(stat);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
511 
512 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/* Solves A x = b on the GPU via the two triangular solves (L then U),
   applying the factorization's row/column reordering on the device. */
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xGPU, *bGPU;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    * tempGPU           = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering */
  ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
  ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
543 
544 
545 
546 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Solves A x = b on the GPU for factors in natural ordering: identical to
   MatSolve_SeqAIJCUSPARSE but with no reorderIn/reorderOut steps, since
   both orderings are the identity. */
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xGPU, *bGPU;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    * tempGPU           = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve: L first, then U */
  stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
575 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/* Copies an assembled SeqAIJ matrix to the GPU in the configured storage
   format (CSR/ELL/HYB).  Any previous GPU matrix is deleted and rebuilt
   from scratch.  Rows with no nonzeros are squeezed out: either the
   existing compressed-row structure is used, or an equivalent one is built
   on the fly for the upload.  Work is only done when the GPU copy is
   absent or stale. */
PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  PetscInt           m            = A->rmap->n,*ii,*ridx;
  PetscErrorCode     ierr;


  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat) {
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec) delete cusparseMat->tempvec;

      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      /* count the rows that actually contain nonzeros */
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        /* reuse the host compressed-row structure as-is */
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU ... only relevant for CSR storage */
        int k=0;
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;

        m = cusparseMat->nonzerorow;
      }

      /* Build our matrix ... first determine the GPU storage type */
      cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);

      /* Create the streams and events (if desired).  */
      PetscMPIInt size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      /* FILL MODE UPPER is irrelevant */
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      /* lastly, build the matrix */
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      /* the temporary row structures were only built locally above */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* scratch vector sized to the number of stored (nonzero) rows */
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
661 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/* Creates GPU (VECSEQCUSP) vectors conforming to the matrix layout:
   right has the length of the column map (an input to MatMult), left has
   the length of the row map (an output of MatMult).  Either pointer may be
   NULL when that vector is not wanted. */
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* bug fix: the right vector lives in the column space, so it takes the
       column-map block size (was mat->rmap->bs, which only agrees for
       square matrices with equal row/column block sizes) */
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
685 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/* y = A*x on the GPU.  Assumes the matrix has already been copied to the
   GPU (done in MatAssemblyEnd); only synchronizes with the device when the
   multiply did not run on its own non-zero stream. */
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
    ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* synchronize only when the work went to the default stream */
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
713 
714 
715 #undef __FUNCT__
716 #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
717 PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
718 {
719   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
720   PetscErrorCode     ierr;
721   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
722   CUSPARRAY          *xarray,*yarray;
723 
724   PetscFunctionBegin;
725   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
726      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
727   if (!cusparseMat->hasTranspose) {
728     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
729     cusparseMat->hasTranspose=PETSC_TRUE;
730   }
731   ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
732   ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
733   try {
734     ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr);
735   } catch (char *ex) {
736     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
737   }
738   ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
739   ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
740   if (!cusparseMat->mat->hasNonZeroStream()) {
741     ierr = WaitForGPU();CHKERRCUSP(ierr);
742   }
743   ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
744   PetscFunctionReturn(0);
745 }
746 
747 #undef __FUNCT__
748 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*zarray;

  PetscFunctionBegin;
  /* Computes zz = A*xx + yy: zz is first seeded with a copy of yy, then the
     product A*xx is accumulated into it on the device.
     No explicit MatSeqAIJCUSPARSECopyToGPU(A) here: the host-to-device copy
     is performed in MatAssemblyEnd_SeqAIJCUSPARSE. */
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add: zarray += A*xarray.  The previous version also acquired
       yy's device array, but it was never passed to the kernel; that dead
       get/restore pair has been removed. */
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
779 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*zarray;

  PetscFunctionBegin;
  /* Computes zz = A^T*xx + yy: zz is first seeded with a copy of yy, then
     A^T*xx is accumulated into it on the device.

     Fixes relative to the previous version:
       - __FUNCT__ above said "MatMultAdd_SeqAIJCUSPARSE" (copy-paste error),
         which mislabeled this routine in PETSc error tracebacks.
       - multiplyAdd accumulated into yy's device array, which clobbered the
         input vector yy while leaving zz equal to the plain copy of yy.  It
         now accumulates into zarray, mirroring MatMultAdd_SeqAIJCUSPARSE. */
  /* No explicit MatSeqAIJCUSPARSECopyToGPU(A) here: the host-to-device copy
     is performed in MatAssemblyEnd_SeqAIJCUSPARSE. */
  if (!cusparseMat->hasTranspose) {
    /* Lazily build and cache the transpose representation on first use. */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    cusparseMat->hasTranspose=PETSC_TRUE;
  }
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose: zarray += A^T*xarray */
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, TRANSPOSE);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
816 
817 #undef __FUNCT__
818 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Complete the host-side AIJ assembly, then mirror the data to the GPU. */
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* Final assembly: install the GPU-backed multiply operations. */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
833 
834 /* --------------------------------------------------------------------------------*/
835 #undef __FUNCT__
836 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
837 /*@
838    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
839    (the default parallel PETSc format). This matrix will ultimately pushed down
840    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
841    assembly performance the user should preallocate the matrix storage by setting
842    the parameter nz (or the array nnz).  By setting these parameters accurately,
843    performance during matrix assembly can be increased by more than a factor of 50.
844 
845    Collective on MPI_Comm
846 
847    Input Parameters:
848 +  comm - MPI communicator, set to PETSC_COMM_SELF
849 .  m - number of rows
850 .  n - number of columns
851 .  nz - number of nonzeros per row (same for all rows)
852 -  nnz - array containing the number of nonzeros in the various rows
853          (possibly different for each row) or NULL
854 
855    Output Parameter:
856 .  A - the matrix
857 
858    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
860    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
861 
862    Notes:
863    If nnz is given then nz is ignored
864 
865    The AIJ format (also called the Yale sparse matrix format or
866    compressed row storage), is fully compatible with standard Fortran 77
867    storage.  That is, the stored row and column indices can begin at
868    either one (as in Fortran) or zero.  See the users' manual for details.
869 
870    Specify the preallocated storage with either nz or nnz (not both).
871    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
872    allocation.  For large problems you MUST preallocate memory or you
873    will get TERRIBLE performance, see the users' manual chapter on matrices.
874 
875    By default, this format uses inodes (identical nodes) when possible, to
876    improve numerical efficiency of matrix-vector products and solves. We
877    search for consecutive rows with the same nonzero structure, thereby
878    reusing matrix information to achieve increased efficiency.
879 
880    Level: intermediate
881 
882 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
883 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Convenience wrapper: create an empty Mat, give it the seqaijcusparse
     type, and hand nz/nnz straight to the SeqAIJ preallocation routine. */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
895 
896 #undef __FUNCT__
897 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* Tear down the GPU-side data hanging off A->spptr, then fall through to
     the regular SeqAIJ destroy for the host-side storage. */
  if (A->factortype==MAT_FACTOR_NONE) {
    /* Unfactored matrix: spptr holds a Mat_SeqAIJCUSPARSE. */
    try {
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        /* The GPU matrix was built; release it through its interface type. */
        delete (GPU_Matrix_Ifc*)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0) delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      /* deleting a null pointer is a harmless no-op if a factor was never built */
      delete (GPU_Matrix_Ifc*) cusparseMatLo;
      delete (GPU_Matrix_Ifc*) cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /* NOTE(review): MAT_cusparseHandle is a library-global handle shared by all
     aijcusparse matrices (created in MatCreate_SeqAIJCUSPARSE), yet it is
     destroyed here as soon as ANY one matrix is destroyed — presumably unsafe
     if other aijcusparse matrices are still alive; verify against callers. */
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);

    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
941 
942 EXTERN_C_BEGIN
943 #undef __FUNCT__
944 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
PetscErrorCode  MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Start from a plain host-side SeqAIJ matrix, then attach the GPU data. */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusparsestruct = new Mat_SeqAIJCUSPARSE;

    cusparsestruct->mat          = 0;
    cusparsestruct->tempvec      = 0;
    cusparsestruct->format       = MAT_CUSPARSE_CSR;
    cusparsestruct->hasTranspose = PETSC_FALSE;

    B->spptr = cusparsestruct;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *trifactors = new Mat_SeqAIJCUSPARSETriFactors;

    trifactors->loTriFactorPtr = 0;
    trifactors->upTriFactorPtr = 0;
    trifactors->tempvec        = 0;
    trifactors->format         = cusparseMatSolveStorageFormat;
    trifactors->hasTranspose   = PETSC_FALSE;

    B->spptr = trifactors;
  }
  /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  /* Install the CUSPARSE-backed operations. */
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* Nothing has been copied to the device yet. */
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
996 EXTERN_C_END
997 
998 /*M
999    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1000 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1002    CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
1003    the CUSPARSE library. This type is only available when using the 'txpetscgpu' package.
1004    Use --download-txpetscgpu to build/install PETSc to use different CUSPARSE library and
1005    the different GPU storage formats.
1006 
1007    Options Database Keys:
1008 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1009 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
1010 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
1011 -  -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
1012 
1013   Level: beginner
1014 
1015 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1016 M*/
1017