xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 99e0435ecafc70c73385656bd0183f8795c2793e)
1 /*
2     Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format.
4 */
5 
6 #include "petscconf.h"
7 PETSC_CUDA_EXTERN_C_BEGIN
8 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
9 //#include "petscbt.h"
10 #include "../src/vec/vec/impls/dvecimpl.h"
11 #include "petsc-private/vecimpl.h"
12 PETSC_CUDA_EXTERN_C_END
13 #undef VecType
14 #include "cusparsematimpl.h"
15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 /* this is such a hack ... but I don't know of another way to pass this variable
18    from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
19    SpMV. Essentially, I need to use the same stream variable in two different
20    data structures. I do this by creating a single instance of that stream
21    and reuse it. */
22 cudaStream_t theBodyStream=0;
23 
24 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
26 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
27 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
31 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
32 PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat);
33 PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
34 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
35 PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
36 PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
37 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Query callback composed on CUSPARSE factor matrices: reports the name of
   the solver package (MATSOLVERCUSPARSE) that produced the factorization. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat mat,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  /* the matrix argument is not consulted; the answer is always the same */
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
46 
47 extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
48 
49 /*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices on a single GPU
  of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp. Currently supported algorithms are ILU(k) and LU;
  ICC(k) and Cholesky will be supported in future versions. This class does NOT support direct solver operations.
53 
54   ./configure --download-txpetscgpu to install PETSc to use CUSPARSE
55 
56   Consult CUSPARSE documentation for more information about the matrix storage formats which correspond to the options
57   database keys below.
58 
59   Options Database Keys:
60 .  -mat_cusparse_solve_storage_format csr - sets the storage format matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
61 
62   Level: beginner
63 
64 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
65 M*/
66 
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Creates a factor matrix of type SEQAIJCUSPARSE for LU-family factorizations.

   The factor is first created by the plain PETSc seqaij path, then retyped so
   that the symbolic routines below (and, through them, the CUSPARSE numeric
   factorization and triangular solves) are used instead.

   Note: the original declaration repeated PETSC_EXTERN_C twice; the duplicate
   token has been removed.
*/
PETSC_EXTERN_C PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
    /* let PCFactorGetMatSolverPackage() & co. discover who produced this factor */
    ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);

    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
86 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/* Type-specific implementation behind MatCUSPARSESetFormat() for SEQAIJCUSPARSE:
   records the requested GPU storage format for the SpMV matrix, the
   triangular-solve matrices, or both. */
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT) {
    /* format used when the matrix is (re)copied to the GPU for SpMV */
    cusparseMat->format = format;
  } else if (op == MAT_CUSPARSE_SOLVE) {
    /* shared setting consumed when triangular factors are built */
    cusparseMatSolveStorageFormat = format;
  } else if (op == MAT_CUSPARSE_ALL) {
    cusparseMat->format           = format;
    cusparseMatSolveStorageFormat = format;
  } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  PetscFunctionReturn(0);
}
110 
111 
112 /*@
113    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
114    operation. Only the MatMult operation can use different GPU storage formats
115    for MPIAIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
116    to build/install PETSc to use this package.
117 
118    Not Collective
119 
120    Input Parameters:
121 +  A - Matrix of type SEQAIJCUSPARSE
122 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
123 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)
124 
125    Output Parameter:
126 
127    Level: intermediate
128 
129 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
130 @*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* Dispatch to whatever implementation the matrix type composed under
     "MatCUSPARSESetFormat_C"; PetscTryMethod is a no-op for types that
     did not compose it, so non-CUSPARSE matrices ignore the request. */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
142 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes the options database for SEQAIJCUSPARSE matrices.

   Unfactored matrices take -mat_cusparse_mult_storage_format (SpMV storage);
   factored matrices take -mat_cusparse_solve_storage_format (TriSolve storage);
   -mat_cusparse_storage_format sets both at once.

   Fix: the return code of PetscObjectOptionsBegin() was previously assigned to
   ierr but never checked; it now goes through CHKERRQ like every other call.
*/
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  /* the catch-all option applies in either case and overrides the above */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
176 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ILU(k): runs the standard CPU SeqAIJ symbolic phase, then routes
   the subsequent numeric phase through the CUSPARSE implementation. */
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* numeric factorization happens on the CPU too, but the CUSPARSE version
     additionally copies the factors to the GPU and prepares the solves */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
189 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic LU: identical strategy to the ILU wrapper above — CPU symbolic
   phase via SeqAIJ, CUSPARSE-aware numeric phase afterwards. */
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* hook the GPU-enabled numeric factorization */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
202 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildLowerTriMatrix"
/*
   Assembles the unit-lower-triangular factor L on the GPU from the host-side
   factored SeqAIJ data of A, and runs the cuSPARSE solve analysis on it.

   The host CSR arrays are staged in pinned (page-locked) memory via
   cudaMallocHost, copied into a freshly created GPU_Matrix_Ifc object, and
   freed again before returning.  Only runs when the GPU copy is absent or
   stale (valid_GPU_matrix is UNALLOCATED or CPU).
*/
PetscErrorCode MatSeqAIJCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               * cusparseMat       = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *ai = a->i,*aj = a->j,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscErrorCode               ierr;
  PetscInt                     *AiLo, *AjLo;   /* pinned host CSR row pointers / column indices for L */
  PetscScalar                  *AALo;          /* pinned host CSR values for L */
  PetscInt                     i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal:
         n unit diagonal entries plus the strictly-lower entries of rows 1..n-1 (ai[n]-ai[1]).
         NOTE(review): this presumes a->i indexes the L part of the factored matrix with row 0
         holding only its diagonal — confirm against the SeqAIJ factored-storage layout. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate pinned host space for the lower triangular matrix (pinned so the
         device upload in setMatrix can run at full bandwidth) */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix: row 0 is just the implicit unit diagonal */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }
      /* NOTE(review): the previous loTriFactorPtr value read above is overwritten
         without a delete — presumably it is NULL on entry; verify no leak on refactorization. */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      /* describe L as triangular/lower to cuSPARSE, upload it, and run the solve analysis */
      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;

      /* the pinned staging arrays are no longer needed once the matrix lives on the GPU */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
273 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildUpperTriMatrix"
/*
   Assembles the upper-triangular factor U on the GPU from the host-side
   factored SeqAIJ data of A (rows walked back-to-front through a->diag),
   and runs the cuSPARSE solve analysis on it.

   As in the lower-triangular builder, host CSR arrays are staged in pinned
   memory and freed after the upload.  Diagonal entries are stored inverted
   (1/d), matching the reciprocal-diagonal convention of the factored data.
*/
PetscErrorCode MatSeqAIJCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               * cusparseMat       = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscInt                     *AiUp, *AjUp;   /* pinned host CSR row pointers / column indices for U */
  PetscScalar                  *AAUp;          /* pinned host CSR values for U */
  PetscInt                     i,nz, nzUpper, offset;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
         adiag decreases as the row index grows, hence adiag[0]-adiag[n]. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate pinned host space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first and
         moving the write offset backwards from the end of the arrays */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements: v[nz] holds the stored diagonal,
           which is written out as its reciprocal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* NOTE(review): as with the lower factor, the prior upTriFactorPtr is
         overwritten without a delete — presumably NULL on entry. */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      /* describe U as triangular/upper to cuSPARSE, upload it, and run the solve analysis */
      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;

      /* release the pinned staging arrays */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
340 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalysisAndCopyToGPU"
/*
   Copies both triangular factors of A to the GPU, allocates the intermediate
   work vector used between the L and U solves, and records the row/column
   permutations on the factors when the orderings are not the identity.
   Marks the GPU copy as current (PETSC_CUSP_BOTH) when done.
*/
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow               = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* build L and U on the GPU (no-ops when the GPU copy is already current) */
  ierr = MatSeqAIJCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector holding the result of the L-solve before the U-solve.
     NOTE(review): allocated with new on every call — presumably this routine
     runs once per numeric factorization; confirm the old vector is not leaked
     when a matrix is refactored. */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /* lower triangular factor: record the row permutation unless it is the identity */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular factor: record the (inverse) column permutation unless it is the identity */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
378 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU/ILU factorization: performs the factorization on the CPU with
   the SeqAIJ kernel, selects the appropriate GPU solve routines based on the
   orderings, then copies the triangular factors to the GPU. */
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      rowIdentity,colIdentity;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* Natural (identity) orderings allow the cheaper solve paths that skip
     the permutation of the right-hand side and solution vectors. */
  ierr = ISIdentity(fact->row,&rowIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(fact->col,&colIdentity);CHKERRQ(ierr);
  if (rowIdentity && colIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* move the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
405 
406 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/* Prepares transposed triangular-solve data for both factors; called lazily
   from the transpose-solve routines the first time they run. */
PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor   = (GPU_Matrix_Ifc*)triFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor   = (GPU_Matrix_Ifc*)triFactors->upTriFactorPtr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* Under TRANSPOSE the fill modes flip: L^T is upper triangular ... */
  stat = loFactor->initializeCusparseMat(MAT_cusparseHandle,
                                         CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                         CUSPARSE_FILL_MODE_UPPER,
                                         TRANSPOSE);CHKERRCUSP(stat);
  stat = loFactor->solveAnalysis(TRANSPOSE);CHKERRCUSP(stat);

  /* ... and U^T is lower triangular */
  stat = upFactor->initializeCusparseMat(MAT_cusparseHandle,
                                         CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                         CUSPARSE_FILL_MODE_LOWER,
                                         TRANSPOSE);CHKERRCUSP(stat);
  stat = upFactor->solveAnalysis(TRANSPOSE);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
429 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/* Builds the transposed GPU representation of A used by MatMultTranspose;
   invoked lazily on the first transpose multiply. */
PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  /* fill mode is irrelevant for a GENERAL matrix; UPPER is passed as a placeholder */
  stat = gpuMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, TRANSPOSE);CHKERRCUSP(stat);
  ierr = gpuMat->mat->setMatrix(A->rmap->n, A->cmap->n, aij->nz, aij->i, aij->j, aij->a, TRANSPOSE);CHKERRCUSP(ierr);
  PetscFunctionReturn(0);
}
443 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/*
   Solves A^T x = b on the GPU using the transposed triangular factors,
   for factorizations with non-identity row/column orderings.
   Since A = LU, A^T = U^T L^T: a U^T solve is followed by an L^T solve,
   bracketed by the permutation handling (reorderIn/reorderOut).
*/
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  CUSPARRAY      *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Generate the transpose solve data lazily, on the first transpose solve */
  if (!cusparseTriFactors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    cusparseTriFactors->hasTranspose=PETSC_TRUE;
  }

  /* Get the GPU pointers for the PETSc vectors */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering: reorderIn presumably permutes b into xGPU under the
     factor ordering, and reorderOut undoes it afterwards — TODO confirm against
     the GPU_Matrix_Ifc implementation */
  ierr = cusparseMatUp->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatUp->solve(xGPU, tempGPU, TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatLo->solve(tempGPU, xGPU, TRANSPOSE);CHKERRCUSP(stat);
  ierr = cusparseMatLo->reorderOut(xGPU);CHKERRCUSP(ierr);

  /* restore the vectors and block until the GPU work is done */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
481 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* Solves A^T x = b on the GPU when the row and column orderings are both the
   identity, so no vector permutation is required.  A^T = U^T L^T, hence a
   U^T solve into the work vector followed by an L^T solve into x. */
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *aij        = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xarray,*barray;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor   = (GPU_Matrix_Ifc*)triFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor   = (GPU_Matrix_Ifc*)triFactors->upTriFactorPtr;
  CUSPARRAY                    *work       = (CUSPARRAY*)triFactors->tempvec;

  PetscFunctionBegin;
  /* generate the transpose solve data lazily, on the first transpose solve */
  if (!triFactors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    triFactors->hasTranspose = PETSC_TRUE;
  }

  /* map the PETSc vectors onto their GPU arrays */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* two transposed triangular solves, no permutations needed */
  stat = upFactor->solve(barray,work,TRANSPOSE);CHKERRCUSP(stat);
  stat = loFactor->solve(work,xarray,TRANSPOSE);CHKERRCUSP(stat);

  /* hand the arrays back and block until the GPU work is complete */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*aij->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
517 
518 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   Solves A x = b on the GPU for factorizations with non-identity row/column
   orderings.  Since A = LU, an L solve is followed by a U solve, bracketed
   by the permutation handling (reorderIn/reorderOut).
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xGPU, *bGPU;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    * tempGPU           = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers for the PETSc vectors */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering: reorderIn presumably permutes b into xGPU under the
     row ordering, and reorderOut applies the inverse column ordering — TODO
     confirm against the GPU_Matrix_Ifc implementation */
  ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
  ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);

  /* restore the vectors and block until the GPU work is done */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
549 
550 
551 
552 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Solves A x = b on the GPU when the row and column orderings are both the
   identity: a forward solve with L into the work vector, then a backward
   solve with U into x — no vector permutation required. */
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *aij        = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xarray,*barray;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor   = (GPU_Matrix_Ifc*)triFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor   = (GPU_Matrix_Ifc*)triFactors->upTriFactorPtr;
  CUSPARRAY                    *work       = (CUSPARRAY*)triFactors->tempvec;

  PetscFunctionBegin;
  /* map the PETSc vectors onto their GPU arrays */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* two triangular solves: L then U */
  stat = loFactor->solve(barray,work);CHKERRCUSP(stat);
  stat = upFactor->solve(work,xarray);CHKERRCUSP(stat);

  /* hand the arrays back and block until the GPU work is complete */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*aij->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
581 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/*
   Copies the (assembled) AIJ matrix to the GPU in the user-selected storage
   format, (re)building the GPU matrix from scratch whenever the host copy is
   newer.  Rows with no nonzeros are squeezed out (compressed-row form) before
   the upload; the row-index array tells the GPU kernels which original rows
   the compressed rows correspond to.
*/
PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  PetscInt           m            = A->rmap->n,*ii,*ridx;   /* ii/ridx: compressed row pointers and original row indices */
  PetscErrorCode     ierr;


  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat) {
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec) delete cusparseMat->tempvec;

      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      /* count the rows that actually hold nonzeros */
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        /* the host already stores a compressed-row view; borrow its arrays */
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU ... only relevant for CSR storage.
           Build temporary ii/ridx arrays that skip the empty rows; these are
           owned by this routine and freed after the upload below. */
        int k=0;
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;

        m = cusparseMat->nonzerorow;
      }

      /* Build our matrix ... first determine the GPU storage type */
      cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);

      /* Create the streams and events (if desired); theBodyStream is shared
         with the parallel (MPIAIJ) SpMV — see the comment at the top of the file. */
      PetscMPIInt size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      /* FILL MODE UPPER is irrelevant for a GENERAL (non-triangular) matrix */
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      /* lastly, build the matrix on the GPU from the (possibly compressed) CSR data */
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      /* the temporaries were only allocated in the !compressedrow.use branch;
         NOTE(review): presumably setMatrix/setCPRowIndices copy their inputs,
         otherwise this free would leave dangling device-side references */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
667 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/*
   Creates GPU (VECSEQCUSP) vectors that conform to the matrix layout:
   right - a vector the matrix can be multiplied against (length cmap->n)
   left  - a vector a matrix-vector product can be stored in (length rmap->n)
   Either output pointer may be NULL to skip that vector.

   Fix: the right vector previously had its block size set from mat->rmap->bs
   even though it conforms to the column map; the PetscLayoutReference() call
   masked the mistake, but the block size now correctly comes from cmap.
*/
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    /* share the matrix column layout rather than keeping a private copy */
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    /* share the matrix row layout rather than keeping a private copy */
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
691 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/* Computes yy = A*xx using the GPU copy of the matrix. */
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray;

  PetscFunctionBegin;
  /* No MatSeqAIJCUSPARSECopyToGPU() here: the GPU copy is kept current by
     MatAssemblyEnd_SeqAIJCUSPARSE. */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
    ierr = gpuMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* when the multiply ran on a non-default stream, synchronization is left to
     the stream machinery; otherwise block here */
  if (!gpuMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*aij->nz - gpuMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
719 
720 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   MatMultTranspose_SeqAIJCUSPARSE - Computes yy = A^T*xx on the GPU.  The
   transpose structure is built lazily on first use and cached via the
   hasTranspose flag.  The matrix data is assumed to already be on the GPU
   (copied in MatAssemblyEnd_SeqAIJCUSPARSE).
*/
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xGPU,*yGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* build the cached transpose representation on first use */
  if (!cusp->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    cusp->hasTranspose = PETSC_TRUE;
  }
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
    ierr = cusp->mat->multiply(xGPU, yGPU, TRANSPOSE);CHKERRCUSP(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* only block the host when the SpMV ran on the default stream */
  if (!cusp->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*aij->nz - cusp->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
752 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   MatMultAdd_SeqAIJCUSPARSE - Computes zz = A*xx + yy on the GPU,
   implemented as zz <- yy followed by zz += A*xx.
*/
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xGPU,*yGPU,*zGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  try {
    /* seed the result with yy, then accumulate the product into it */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
    /* yGPU is never passed to the kernel below; presumably the access is
       kept for its host/device coherence side effects — TODO confirm */
    ierr = VecCUSPGetArrayRead(yy,&yGPU);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zGPU);CHKERRQ(ierr);

    /* zz += A*xx */
    ierr = cusp->mat->multiplyAdd(xGPU, zGPU);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yGPU);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zGPU);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*aij->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
785 
#undef __FUNCT__
/* was incorrectly defined as "MatMultAdd_SeqAIJCUSPARSE", which mislabeled
   error tracebacks originating in this function */
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   MatMultTransposeAdd_SeqAIJCUSPARSE - Computes zz = A^T*xx + yy on the
   GPU, implemented as zz <- yy followed by zz += A^T*xx.  The transpose
   structure is built lazily on first use and cached via hasTranspose.
*/
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray,*zarray;

  PetscFunctionBegin;
  /* build the cached transpose representation on first use */
  if (!cusparseMat->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    cusparseMat->hasTranspose=PETSC_TRUE;
  }
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose: accumulate into zarray (which was
       seeded with yy by the VecCopy above).  The previous code accumulated
       into yarray, clobbering yy and leaving zz without the product term;
       the non-transpose MatMultAdd_SeqAIJCUSPARSE correctly uses zarray. */
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, TRANSPOSE);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
822 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
   MatAssemblyEnd_SeqAIJCUSPARSE - Completes CPU-side assembly, pushes the
   matrix to the GPU, and (on final assembly only) installs the GPU
   multiplication kernels into the function table.
*/
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* final assembly: route multiplications through the GPU implementations */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
839 
840 /* --------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* calls the SeqAIJ preallocation implementation directly (the type was set just above) */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
901 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   MatDestroy_SeqAIJCUSPARSE - Frees the GPU-side data hung off A->spptr
   (either the SpMV structure or, for factored matrices, the triangular
   factors), then delegates to MatDestroy_SeqAIJ for the CPU-side data.
*/
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    /* unfactored matrix: spptr holds a Mat_SeqAIJCUSPARSE */
    try {
      /* the GPU matrix only exists once something has been copied to the GPU */
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        delete (GPU_Matrix_Ifc*)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0) delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      delete (GPU_Matrix_Ifc*) cusparseMatLo;
      delete (GPU_Matrix_Ifc*) cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /* NOTE(review): the library-wide cusparse handle is destroyed whenever ANY
     matrix of this type is destroyed; if other aijcusparse matrices are still
     alive they will recreate-or-fail on their next use of the handle — verify
     this teardown is intended to be global rather than reference counted */
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);

    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
947 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   MatCreate_SeqAIJCUSPARSE - Type constructor: builds the base SeqAIJ
   matrix, attaches the GPU-side data structure appropriate for an
   unfactored matrix or for triangular factors, ensures the shared cusparse
   handle exists, and installs the CUSPARSE operation table.
*/
PETSC_EXTERN_C PetscErrorCode  MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusp = new Mat_SeqAIJCUSPARSE;

    cusp->mat          = 0;
    cusp->tempvec      = 0;
    cusp->format       = MAT_CUSPARSE_CSR;
    cusp->hasTranspose = PETSC_FALSE;

    B->spptr = cusp;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;

    tri->loTriFactorPtr = 0;
    tri->upTriFactorPtr = 0;
    tri->tempvec        = 0;
    tri->format         = cusparseMatSolveStorageFormat;
    tri->hasTranspose   = PETSC_FALSE;

    B->spptr = tri;
  }
  /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1001 
1002 /*M
1003    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1004 
1005    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1006    CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
1007    the CUSPARSE library. This type is only available when using the 'txpetscgpu' package.
1008    Use --download-txpetscgpu to build/install PETSc to use different CUSPARSE library and
1009    the different GPU storage formats.
1010 
1011    Options Database Keys:
1012 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1013 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
1014 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
1015 -  -mat_cusparse_solve_storage_format csr - sets the storage format of matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
1016 
1017   Level: beginner
1018 
1019 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1020 M*/
1021