xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision f7a087814f99396ec53a0bad244374eaa0312225)
1 /*
2     Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format.
4 */
5 
6 #include "petscconf.h"
7 PETSC_CUDA_EXTERN_C_BEGIN
8 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
9 //#include "petscbt.h"
10 #include "../src/vec/vec/impls/dvecimpl.h"
11 #include "petsc-private/vecimpl.h"
12 PETSC_CUDA_EXTERN_C_END
13 #undef VecType
14 #include "cusparsematimpl.h"
15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 /* this is such a hack ... but I don't know of another way to pass this variable
18    from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
19    SpMV. Essentially, I need to use the same stream variable in two different
20    data structures. I do this by creating a single instance of that stream
21    and reuse it. */
22 cudaStream_t theBodyStream=0;
23 
24 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
26 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
27 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
31 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
32 PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat);
33 PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
34 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
35 PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
36 PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
37 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/*
   Query callback composed on factored matrices (see MatGetFactor_seqaij_cusparse):
   reports that the MATSOLVERCUSPARSE package produced this factorization.
   The input matrix A is not inspected.
*/
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
46 
47 EXTERN_C_BEGIN
48 extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
49 EXTERN_C_END
50 /*MC
51   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices on a single GPU of type,
52   seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported algorithms are ILU(k) and LU. ICC(k) and
53   Cholesky will be supported in future versions. This class does NOT support direct solver operations.
54 
55   ./configure --download-txpetscgpu to install PETSc to use CUSPARSE
56 
57   Consult CUSPARSE documentation for more information about the matrix storage formats which correspond to the options
58   database keys below.
59 
60   Options Database Keys:
.  -mat_cusparse_solve_storage_format csr - sets the storage format of matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
62 
63   Level: beginner
64 
65 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
66 M*/
67 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Factory routine for the "cusparse" solver package: creates the factor
   matrix via the standard PETSc SeqAIJ factor support, then retypes it to
   MATSEQAIJCUSPARSE so the numeric factorization installs GPU triangular
   solves.  Only LU-type factorizations (LU, ILU, ILUDT) are accepted;
   anything else raises PETSC_ERR_SUP.
*/
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* let the host implementation allocate and shape the factor matrix */
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
    /* expose the solver-package query on the factor matrix */
    ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);

    /* symbolic phases run on the CPU; they install the CUSPARSE numeric kernel */
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
EXTERN_C_END
89 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
   Type-specific implementation behind MatCUSPARSESetFormat(): records the
   requested GPU storage format for the SpMV matrix (stored in A->spptr),
   the triangular-solve factors (a file-scope setting), or both.
*/
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cuspstruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT) {
    /* only the multiply (SpMV) matrix changes format */
    cuspstruct->format = format;
  } else if (op == MAT_CUSPARSE_SOLVE) {
    /* only future triangular factors change format */
    cusparseMatSolveStorageFormat = format;
  } else if (op == MAT_CUSPARSE_ALL) {
    cuspstruct->format            = format;
    cusparseMatSolveStorageFormat = format;
  } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  PetscFunctionReturn(0);
}
EXTERN_C_END
115 
116 
117 /*@
118    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
119    operation. Only the MatMult operation can use different GPU storage formats
120    for MPIAIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
121    to build/install PETSc to use this package.
122 
123    Not Collective
124 
125    Input Parameters:
126 +  A - Matrix of type SEQAIJCUSPARSE
127 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
128 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)
129 
130    Output Parameter:
131 
132    Level: intermediate
133 
134 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
135 @*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation; PetscTryMethod is a
     no-op when A's type has not composed "MatCUSPARSESetFormat_C" */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
147 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes the options database for a SEQAIJCUSPARSE matrix.  For a
   non-factored matrix the SpMV storage format can be chosen; for a factor
   matrix the triangular-solve format; -mat_cusparse_storage_format sets
   both.

   Fix: PetscObjectOptionsBegin() must open the options block before
   PetscOptionsHead() labels it; the original called them in the opposite
   order.

   NOTE(review): the MAT_CUSPARSE_ALL branch is also reachable for factor
   matrices, whose A->spptr holds a Mat_SeqAIJCUSPARSETriFactors rather
   than a Mat_SeqAIJCUSPARSE — verify that cast in
   MatCUSPARSESetFormat_SeqAIJCUSPARSE is intended for that case.
*/
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* non-factored matrix: the format applies to the SpMV matrix */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    /* factor matrix: the format applies to the triangular factors */
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  /* catch-all option overriding both of the above */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
181 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic ILU factorization: performed entirely on the CPU by the SeqAIJ
   implementation; then the numeric phase is redirected to the CUSPARSE
   version so the resulting factors are uploaded to the GPU.
*/
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);

  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
194 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic LU factorization: performed entirely on the CPU by the SeqAIJ
   implementation; then the numeric phase is redirected to the CUSPARSE
   version so the resulting factors are uploaded to the GPU.
*/
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);

  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
207 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildLowerTriMatrix"
/*
   Assembles the unit-lower-triangular factor L in CSR form in page-locked
   host memory, uploads it to the GPU, and runs the CUSPARSE triangular-solve
   analysis on it.  The explicit 1.0 diagonal entries are added here because
   the factored SeqAIJ storage keeps L's diagonal implicit.  Only done when
   the GPU copy is missing or stale.
*/
PetscErrorCode MatSeqAIJCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               * cusparseMat       = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *ai = a->i,*aj = a->j,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscErrorCode               ierr;
  PetscInt                     *AiLo, *AjLo;
  PetscScalar                  *AALo;
  PetscInt                     i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): reads ai[1], so this assumes n >= 1 — confirm callers
         never reach here with an empty matrix */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) buffers for the host->device transfer */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      /* row 0 of L holds only the unit diagonal */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }
      /* the value read from loTriFactorPtr above is discarded; a fresh GPU
         matrix in the requested storage format is always created here */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;

      /* NOTE(review): the pinned buffers leak if an error fires between the
         cudaMallocHost calls and here */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
278 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildUpperTriMatrix"
/*
   Assembles the upper-triangular factor U in CSR form in page-locked host
   memory, uploads it to the GPU, and runs the CUSPARSE triangular-solve
   analysis on it.  U's rows are walked backwards through a->diag
   (presumably the reverse U layout produced by MatLUFactorNumeric_SeqAIJ —
   see that routine); the stored diagonal value is inverted here so the GPU
   solve can multiply instead of divide.  Only done when the GPU copy is
   missing or stale.
*/
PetscErrorCode MatSeqAIJCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               * cusparseMat       = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscInt                     *AiUp, *AjUp;
  PetscScalar                  *AAUp;
  PetscInt                     i,nz, nzUpper, offset;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned (page-locked) buffers for the host->device transfer */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      /* fill rows from the bottom up, decrementing offset as we go */
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* v[nz] is row i's diagonal; store its reciprocal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* the value read from upTriFactorPtr above is discarded; a fresh GPU
         matrix in the requested storage format is always created here */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;

      /* NOTE(review): the pinned buffers leak if an error fires between the
         cudaMallocHost calls and here */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
345 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalysisAndCopyToGPU"
/*
   Uploads both triangular factors of A to the GPU, allocates the GPU work
   vector shared by the two solve stages, and registers the row/column
   permutations with the factors so the solve kernels can reorder
   right-hand sides and solutions on the device.
*/
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow               = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);

  /* device scratch vector holding the intermediate result between the
     L-solve and the U-solve */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  /* identity permutations need no device-side reordering */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  /* note iscol is a->icol (the inverse column permutation) */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
383 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric LU factorization: delegates the factorization itself to the CPU
   SeqAIJ kernel, selects the natural-ordering solve variants when both
   permutations are identities (avoiding device-side reordering), and
   uploads the factors to the GPU.
*/
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
  IS             rowperm  = b->row;
  IS             colperm  = b->col;
  PetscBool      rowIsNatural,colIsNatural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factor on the host first */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(rowperm,&rowIsNatural);CHKERRQ(ierr);
  ierr = ISIdentity(colperm,&colIsNatural);CHKERRQ(ierr);
  if (rowIsNatural && colIsNatural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
410 
411 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/*
   Prepares the CUSPARSE transpose-solve analysis for both triangular
   factors.  The fill modes are deliberately swapped relative to the
   non-transposed factors: L^T is upper triangular and U^T is lower
   triangular.
*/
PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc* cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  PetscFunctionBegin;
  /* L transposed is upper triangular */
  stat = cusparseMatLo->initializeCusparseMat(MAT_cusparseHandle,
                                              CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                              CUSPARSE_FILL_MODE_UPPER,
                                              TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatLo->solveAnalysis(TRANSPOSE);CHKERRCUSP(stat);

  /* U transposed is lower triangular */
  stat = cusparseMatUp->initializeCusparseMat(MAT_cusparseHandle,
                                              CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                              CUSPARSE_FILL_MODE_LOWER,
                                              TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatUp->solveAnalysis(TRANSPOSE);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
434 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/*
   Builds the device-side data needed for transposed SpMV (MatMultTranspose)
   from the host CSR arrays of A; called lazily on the first transpose
   multiply.
*/
PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ      *a          = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t stat;
  PetscFunctionBegin;
  /* fill mode is irrelevant for a general (non-triangular) matrix */
  stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, TRANSPOSE);CHKERRCUSP(stat);
  ierr = cusparseMat->mat->setMatrix(A->rmap->n, A->cmap->n, a->nz, a->i, a->j, a->a, TRANSPOSE);CHKERRCUSP(ierr);
  PetscFunctionReturn(0);
}
448 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/*
   Solves A^T x = b on the GPU using the transposed triangular factors,
   applying the stored row/column permutations on the device.  The
   transpose analysis is generated lazily on first use.  For the transpose
   the factors are applied in reverse order: U^T first, then L^T.
*/
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  CUSPARRAY      *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Analyze the matrix ... on the fly */
  if (!cusparseTriFactors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    cusparseTriFactors->hasTranspose=PETSC_TRUE;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering */
  ierr = cusparseMatUp->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatUp->solve(xGPU, tempGPU, TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatLo->solve(tempGPU, xGPU, TRANSPOSE);CHKERRCUSP(stat);
  ierr = cusparseMatLo->reorderOut(xGPU);CHKERRCUSP(ierr);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  /* forward+backward substitution costs about 2*nz - n flops */
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
486 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/*
   Solves A^T x = b on the GPU for factors in natural ordering — identical
   to MatSolveTranspose_SeqAIJCUSPARSE except that no device-side
   reordering of b or x is required.
*/
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode    ierr;
  CUSPARRAY         *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Analyze the matrix ... on the fly */
  if (!cusparseTriFactors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    cusparseTriFactors->hasTranspose=PETSC_TRUE;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve: U^T first, then L^T (reverse of the non-transposed order) */
  stat = cusparseMatUp->solve(bGPU, tempGPU, TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatLo->solve(tempGPU, xGPU, TRANSPOSE);CHKERRCUSP(stat);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  /* forward+backward substitution costs about 2*nz - n flops */
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
522 
523 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   Solves A x = b on the GPU via the two triangular factors (L then U),
   applying the stored row/column permutations on the device.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xGPU, *bGPU;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    * tempGPU           = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering: permute b into x, L-solve into the scratch
     vector, U-solve back into x, then undo the permutation on x */
  ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
  ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  /* forward+backward substitution costs about 2*nz - n flops */
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
554 
555 
556 
557 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   Solves A x = b on the GPU for factors in natural ordering — identical to
   MatSolve_SeqAIJCUSPARSE except that no device-side reordering of b or x
   is required.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xGPU, *bGPU;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    * tempGPU           = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve: L first into the scratch vector, then U into x */
  stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  /* forward+backward substitution costs about 2*nz - n flops */
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
586 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/*
   Copies a (non-factored) SeqAIJ matrix to the GPU in the selected storage
   format, for use by the SpMV kernels.  Any existing GPU matrix is deleted
   and rebuilt; rows with no nonzeros are squeezed out (compressed-row
   form) before the upload.  No-op when the GPU copy is already current.
*/
PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  PetscInt           m            = A->rmap->n,*ii,*ridx;
  PetscErrorCode     ierr;


  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat) {
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec) delete cusparseMat->tempvec;

      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      /* count rows that have at least one nonzero (used for flop logging
         and for sizing the forced compressed-row arrays below) */
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        /* host matrix already stores compressed-row data; use it directly */
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU ... only relevant for CSR storage */
        int k=0;
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;

        m = cusparseMat->nonzerorow;
      }

      /* Build our matrix ... first determine the GPU storage type */
      cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);

      /* Create the streams and events (if desired).  */
      /* NOTE(review): queries PETSC_COMM_WORLD rather than the matrix's own
         communicator — verify this is intended */
      PetscMPIInt size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      /* FILL MODE UPPER is irrelevant */
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      /* lastly, build the matrix */
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      /* ii/ridx were allocated here only in the forced-compression branch */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* device scratch vector sized to the compressed row count */
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
672 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/* Helper: create one VECSEQCUSP vector whose layout references the given map. */
static PetscErrorCode MatGetVecs_SeqAIJCUSPARSE_Create(Mat mat,PetscLayout map,Vec *v)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCreate(PetscObjectComm((PetscObject)mat),v);CHKERRQ(ierr);
  ierr = VecSetSizes(*v,map->n,PETSC_DETERMINE);CHKERRQ(ierr);
  /* both left and right vectors use the row block size, as before */
  ierr = VecSetBlockSize(*v,mat->rmap->bs);CHKERRQ(ierr);
  ierr = VecSetType(*v,VECSEQCUSP);CHKERRQ(ierr);
  ierr = PetscLayoutReference(map,&(*v)->map);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Creates GPU (VECSEQCUSP) vectors conforming to the matrix: *right sized
   by the column layout, *left by the row layout.  Either output may be
   NULL if not wanted.
*/
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = MatGetVecs_SeqAIJCUSPARSE_Create(mat,mat->cmap,right);CHKERRQ(ierr);
  }
  if (left) {
    ierr = MatGetVecs_SeqAIJCUSPARSE_Create(mat,mat->rmap,left);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
696 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   Computes yy = A*xx entirely on the GPU; assumes the matrix has already
   been uploaded (done in MatAssemblyEnd_SeqAIJCUSPARSE per the note below).
*/
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
    ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* when a nonzero stream is in use the multiply is left asynchronous */
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  /* 2 flops per stored nonzero, minus one add per nonempty row */
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
724 
725 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/* yy = A^T*xx, computed on the GPU. The transpose data structure is built
   lazily on first use and then cached on the Mat. */
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xGPU,*yGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* No MatSeqAIJCUSPARSECopyToGPU(A) call needed here; it was moved to
     MatAssemblyEnd_SeqAIJCUSPARSE */
  if (!cusp->hasTranspose) {
    /* generate the transpose representation once; reused on later calls */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    cusp->hasTranspose = PETSC_TRUE;
  }
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
    ierr = cusp->mat->multiply(xGPU,yGPU,TRANSPOSE);CHKERRCUSP(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* synchronize only when the multiply ran on the default (zero) stream */
  if (!cusp->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*aij->nz - cusp->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
757 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/* zz = A*xx + yy, computed on the GPU: zz is seeded with a copy of yy and
   the product is then accumulated into it. */
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xGPU,*yGPU,*zGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* No MatSeqAIJCUSPARSECopyToGPU(A) call needed here; it was moved to
     MatAssemblyEnd_SeqAIJCUSPARSE */
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
    /* yy's array is fetched/restored but never passed to multiplyAdd;
       presumably the get/restore pair only ensures GPU residency -- TODO confirm */
    ierr = VecCUSPGetArrayRead(yy,&yGPU);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zGPU);CHKERRQ(ierr);

    /* accumulate the product into zz (which already holds yy) */
    ierr = cusp->mat->multiplyAdd(xGPU,zGPU);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yGPU);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zGPU);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*aij->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
790 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"  /* was a stale copy-paste of "MatMultAdd_SeqAIJCUSPARSE" */
/* zz = A^T*xx + yy, computed on the GPU. The transpose data structure is
   built lazily on first use and cached on the Mat. */
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray,*zarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!cusparseMat->hasTranspose) {
    /* generate the transpose representation once; reused on later calls */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    cusparseMat->hasTranspose=PETSC_TRUE;
  }
  try {
    /* seed zz with yy, then accumulate the transpose product into zz */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose: accumulate into zarray. The
       original passed yarray here, which matched neither the VecCopy above
       nor the non-transpose MatMultAdd (which accumulates into zarray), so
       the result never reached zz. */
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, TRANSPOSE);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
827 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/* Finish host-side assembly, push the matrix data to the GPU, and -- on a
   final (non-flush) assembly -- install the GPU multiply implementations. */
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
844 
845 /* --------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* sequential matrix: local and global sizes coincide (m,n used for both) */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
906 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/* Destructor for MATSEQAIJCUSPARSE: frees the GPU-side data hanging off
   A->spptr (whose actual type depends on A->factortype), then delegates to
   the host-side SeqAIJ destructor. */
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    /* plain (unfactored) matrix: spptr holds a Mat_SeqAIJCUSPARSE */
    try {
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        delete (GPU_Matrix_Ifc*)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0) delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* factored matrix: spptr holds the lower/upper triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      delete (GPU_Matrix_Ifc*) cusparseMatLo;
      delete (GPU_Matrix_Ifc*) cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /* NOTE(review): this tears down the file-global cusparse handle as soon as
     ANY aijcusparse matrix is destroyed, even though the handle is shared by
     all such matrices (created once in MatCreate_SeqAIJCUSPARSE) -- confirm
     other still-live matrices cope with the handle being recreated/lost */
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);

    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
952 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/* Type constructor: builds a host SeqAIJ matrix, attaches the GPU-specific
   data structure in B->spptr (plain matrix or triangular-factor holder,
   depending on factortype), and installs the CUSPARSE method table. */
PetscErrorCode  MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just
       created; now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusp = new Mat_SeqAIJCUSPARSE;

    cusp->mat          = 0;
    cusp->tempvec      = 0;
    cusp->format       = MAT_CUSPARSE_CSR;
    cusp->hasTranspose = PETSC_FALSE;

    B->spptr = cusp;
  } else {
    /* factored matrix: hold the (not-yet-built) triangular factors instead */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;

    tri->loTriFactorPtr = 0;
    tri->upTriFactorPtr = 0;
    tri->tempvec        = 0;
    tri->format         = cusparseMatSolveStorageFormat;
    tri->hasTranspose   = PETSC_FALSE;

    B->spptr = tri;
  }
  /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END
1008 
1009 /*M
1010    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1011 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
   the CUSPARSE library. This type is only available when using the 'txpetscgpu' package.
   Configure PETSc with --download-txpetscgpu to enable the CUSPARSE library and
   the different GPU storage formats.
1017 
1018    Options Database Keys:
1019 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1020 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
1021 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
-  -mat_cusparse_solve_storage_format csr - sets the storage format of matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
1023 
1024   Level: beginner
1025 
1026 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1027 M*/
1028