xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 784ad74dfe8240901f89316a5dd0cd307000ed2b)
1 /*
2     Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format.
4 */
5 
6 #include "petscconf.h"
7 PETSC_CUDA_EXTERN_C_BEGIN
8 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
9 //#include "petscbt.h"
10 #include "../src/vec/vec/impls/dvecimpl.h"
11 #include "petsc-private/vecimpl.h"
12 PETSC_CUDA_EXTERN_C_END
13 #undef VecType
14 #include "cusparsematimpl.h"
15 
16 // this is such a hack ... but I don't know of another way to pass this variable
17 // from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
18 //  SpMV. Essentially, I need to use the same stream variable in two different
19 //  data structures. I do this by creating a single instance of that stream
20 //  and reuse it.
21 cudaStream_t theBodyStream=0;
22 
23 EXTERN_C_BEGIN
24 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
26 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *);
27 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
30 PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat);
31 PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
32 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
33 PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
34 PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
35 EXTERN_C_END
36 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/*
   Query function composed on CUSPARSE factor matrices: reports the solver
   package (MATSOLVERCUSPARSE) that produced the factorization.
*/
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
EXTERN_C_END
47 
48 EXTERN_C_BEGIN
49 extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
50 EXTERN_C_END
51 
52 
53 
54 /*MC
55   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices
56   on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp
58   ./configure --download-txpetscgpu to install PETSc to use CUSPARSE solves
59 
60   Options Database Keys:
+ -mat_mult_cusparse_storage_format <CSR>         - (choose one of) CSR, ELL, HYB
- -mat_solve_cusparse_storage_format <CSR>        - (choose one of) CSR, ELL, HYB
63 
64    Level: beginner
65 M*/
66 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Creates a factor matrix of type SEQAIJCUSPARSE for LU/ILU/ILUDT
   factorizations.  The symbolic factorization routines are overridden so
   that the numeric phase later installs the CUSPARSE triangular solves.
*/
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* Reject unsupported factor types BEFORE creating the factor matrix;
     previously *B was created first and then abandoned (leaked) on the
     unsupported-type error path. */
  if (ftype != MAT_FACTOR_LU && ftype != MAT_FACTOR_ILU && ftype != MAT_FACTOR_ILUDT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  /* route the symbolic phases through the CUSPARSE-aware versions */
  (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
  (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  (*B)->factortype             = ftype;
  PetscFunctionReturn(0);
}
EXTERN_C_END
87 
#undef __FUNCT__
#define __FUNCT__ "MatAIJCUSPARSESetGPUStorageFormatForMatSolve"
/*
   Sets the GPU storage format used for the CUSPARSE triangular solves of a
   factored matrix.  If triangular factors already exist on the GPU they are
   destroyed and rebuilt in the requested format.
*/
PetscErrorCode MatAIJCUSPARSESetGPUStorageFormatForMatSolve(Mat A,GPUStorageFormat format)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc* cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  PetscFunctionBegin;
  cusparseTriFactors->format = format;
  if (cusparseMatLo && cusparseMatUp) {
    /* Factors exist already: delete them and clear the struct pointers so
       they are not left dangling between the delete and the rebuild. */
    delete cusparseMatLo;
    delete cusparseMatUp;
    cusparseTriFactors->loTriFactorPtr = 0;
    cusparseTriFactors->upTriFactorPtr = 0;

    /* key data was deleted so reset the flag */
    A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

    /* rebuild the factors in the new storage format */
    ierr = MatCUSPARSEAnalysisAndCopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
113 
//EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetOption_SeqAIJCUSPARSE"
/*
   MatSetOption handler for SEQAIJCUSPARSE.  Delegates to the SeqAIJ
   implementation first, then interprets the GPU storage-format options:
   CSR, ELL and HYB select the corresponding format; DIA is not supported
   by this matrix type.  All other options are fully handled by SeqAIJ.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool  flg)
{
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  switch (op) {
  case MAT_DIAGBLOCK_CSR:
  case MAT_OFFDIAGBLOCK_CSR:
  case MAT_CSR:
    gpuMat->format = CSR;
    break;
  case MAT_DIAGBLOCK_DIA:
  case MAT_OFFDIAGBLOCK_DIA:
  case MAT_DIA:
    /* SETERRQ returns from the function, so no break is needed */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported GPU matrix storage format DIA for (MPI,SEQ)AIJCUSPARSE matrix type.");
  case MAT_DIAGBLOCK_ELL:
  case MAT_OFFDIAGBLOCK_ELL:
  case MAT_ELL:
    gpuMat->format = ELL;
    break;
  case MAT_DIAGBLOCK_HYB:
  case MAT_OFFDIAGBLOCK_HYB:
  case MAT_HYB:
    gpuMat->format = HYB;
    break;
  default:
    /* not a storage-format option: nothing more to do here */
    break;
  }
  PetscFunctionReturn(0);
}
//EXTERN_C_END
149 
//EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes the runtime options selecting the GPU storage format:
     -mat_mult_cusparse_storage_format   (unfactored matrices, SpMV)
     -mat_solve_cusparse_storage_format  (factored matrices, TriSolve)
*/
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscInt       idx=0;
  char           *formats[] ={CSR,ELL,HYB};
  MatOption      format=MAT_CSR;   /* default matches formats[0] */
  PetscBool      flg;

  PetscFunctionBegin;
  ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"When using TxPETSCGPU, AIJCUSPARSE Options","Mat");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEList("-mat_mult_cusparse_storage_format",
			     "Set the storage format of (seq)aijcusparse gpu matrices for SpMV",
			     "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr);
    /* Map the selected list index to the MatOption.  The previous code
       compared string POINTERS (formats[idx] == CSR), which is not
       guaranteed to hold even for identical literals, and left `format`
       uninitialized when no comparison matched. */
    switch (idx) {
    case 0: format = MAT_CSR; break;
    case 1: format = MAT_ELL; break;
    case 2: format = MAT_HYB; break;
    }
    ierr = MatSetOption_SeqAIJCUSPARSE(A,format,PETSC_TRUE);CHKERRQ(ierr);
  } else {
    ierr = PetscOptionsEList("-mat_solve_cusparse_storage_format",
			     "Set the storage format of (seq)aijcusparse gpu matrices for TriSolve",
			     "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr);
    ierr = MatAIJCUSPARSESetGPUStorageFormatForMatSolve(A,formats[idx]);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
//EXTERN_C_END
186 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic ILU factorization: reuses the SeqAIJ symbolic phase, then
   routes the numeric phase through the CUSPARSE implementation so the
   triangular factors end up on the GPU.
*/
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
198 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic LU factorization: reuses the SeqAIJ symbolic phase, then
   routes the numeric phase through the CUSPARSE implementation so the
   triangular factors end up on the GPU.
*/
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
210 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEBuildLowerTriMatrix"
/*
   Builds the unit lower triangular factor on the GPU from the host-side
   SeqAIJ factor data in A, then runs the cusparse triangular-solve
   analysis on it.  CSR arrays for the factor are staged in pinned host
   memory (cudaMallocHost) for fast upload and freed once the GPU object
   owns a copy.  No-op when the GPU copy is already current.
   Assumes a->i/a->j/a->a hold the strictly-lower part of the factor rows
   (layout produced by the SeqAIJ LU factorization) -- the unit diagonal is
   appended here explicitly.
*/
PetscErrorCode MatCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  /* NOTE(review): this initial value (loTriFactorPtr) is never used; it is
     overwritten by GPU_Matrix_Factory::getNew() below.  If a factor already
     exists it is not freed here -- callers reset the pointer first. */
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *ai = a->i,*aj = a->j,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscErrorCode     ierr;
  PetscInt *AiLo, *AjLo;
  PetscScalar *AALo;
  PetscInt i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries plus the strictly-lower entries of rows
         1..n-1 (row 0 of the strictly-lower part is empty) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix; row 0 holds only its unit diagonal */
      AiLo[0]=(PetscInt) 0;
      AiLo[n]=nzLower;
      AjLo[0]=(PetscInt) 0;
      AALo[0]=(MatScalar) 1.0;
      v    = aa;
      vi   = aj;
      offset=1;
      rowOffset=1;
      for (i=1; i<n; i++) {
        nz  = ai[i+1] - ai[i];
        // additional 1 for the term on the diagonal
        AiLo[i]=rowOffset;
        rowOffset+=nz+1;

        /* copy the strictly-lower entries of row i ... */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        /* ... then append the explicit unit diagonal */
        offset+=nz;
        AjLo[offset]=(PetscInt) i;
        AALo[offset]=(MatScalar) 1.0;
        offset+=1;

        v  += nz;
        vi += nz;
      }
      /* create the GPU matrix in the configured storage format and run the
         cusparse solve analysis so subsequent solves are fast */
      cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;
      // free temporary data
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
279 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEBuildUpperTriMatrix"
/*
   Builds the upper triangular factor on the GPU from the host-side SeqAIJ
   factor data in A, using a->diag to locate each row's diagonal, then runs
   the cusparse triangular-solve analysis.  Rows are filled back-to-front;
   each row's diagonal entry is written as the reciprocal of the value found
   at the diagonal position (1./v[nz]).  CSR arrays are staged in pinned
   host memory and freed once the GPU object owns a copy.  No-op when the
   GPU copy is already current.
*/
PetscErrorCode MatCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  /* NOTE(review): this initial value (upTriFactorPtr) is never used; it is
     overwritten by GPU_Matrix_Factory::getNew() below. */
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscInt *AiUp, *AjUp;
  PetscScalar *AAUp;
  PetscInt i,nz, nzUpper, offset;
  PetscErrorCode     ierr;

  PetscFunctionBegin;

  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag runs backwards through a->a, so adiag[0]-adiag[n] is the
         total count of upper-triangular entries including diagonals */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, last row first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--){
        /* off-diagonal entries of row i start just past adiag[i+1] */
        v   = aa + adiag[i+1] + 1;
        vi  = aj + adiag[i+1] + 1;

        // number of elements NOT on the diagonal
        nz = adiag[i] - adiag[i+1]-1;

        // decrement the offset
        offset -= (nz+1);

        // first, set the diagonal elements
        AjUp[offset] = (PetscInt) i;
        /* diagonal stored as its reciprocal; v[nz] is the value at the
           diagonal position of row i */
        AAUp[offset] = 1./v[nz];
        AiUp[i] = AiUp[i+1] - (nz+1);

        /* then copy the strictly-upper entries of row i */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* create the GPU matrix in the configured storage format and run the
         cusparse solve analysis so subsequent solves are fast */
      cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;
      // free temporary data
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
345 
#undef __FUNCT__
/* fixed: __FUNCT__ previously misspelled the function name
   ("MatCUSPARSEAnalysiAndCopyToGPU"), breaking PETSc tracebacks */
#define __FUNCT__ "MatCUSPARSEAnalysisAndCopyToGPU"
/*
   Builds both triangular factors on the GPU (including the cusparse solve
   analysis), allocates the temporary work vector shared by the two solve
   sweeps, and uploads the row/column permutation indices needed by the
   reordered MatSolve path when the orderings are not identities.
*/
PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS               isrow = a->row,iscol = a->icol;
  PetscBool        row_identity,col_identity;
  const PetscInt   *r,*c;
  PetscInt          n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);
  /* work vector shared by the lower and upper solve sweeps */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /* lower triangular (row) permutation indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular (column) permutation indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
380 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric LU/ILU factorization: performs the factorization on the CPU,
   selects the MatSolve variant based on whether the row/column orderings
   are identities, and copies the triangular factors to the GPU.
*/
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ *)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* natural orderings can skip the reorder steps in MatSolve */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
  }

  /* build the triangular factors on the GPU and run the solve analysis */
  ierr = MatCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
405 
406 
407 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   GPU triangular solve x = U^{-1} L^{-1} b for permuted orderings: the
   right-hand side is permuted into x, the two sweeps run through the
   shared temporary vector, and the permutation is undone on the result.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc   *lower = (GPU_Matrix_Ifc*)factors->loTriFactorPtr;
  GPU_Matrix_Ifc   *upper = (GPU_Matrix_Ifc*)factors->upTriFactorPtr;
  CUSPARRAY        *work  = (CUSPARRAY*)factors->tempvec;
  Mat_SeqAIJ       *a     = (Mat_SeqAIJ*)A->data;
  CUSPARRAY        *xGPU,*bGPU;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  /* device-side arrays of the solution and right-hand side */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* permute b into x, sweep lower then upper, undo the permutation */
  ierr = lower->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = lower->solve(xGPU, work);CHKERRCUSP(stat);
  stat = upper->solve(work, xGPU);CHKERRCUSP(stat);
  ierr = upper->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
438 
439 
440 
441 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   GPU triangular solve x = U^{-1} L^{-1} b for the natural ordering:
   identical to MatSolve_SeqAIJCUSPARSE but without the reorder steps,
   since the row and column permutations are identities.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc   *lower = (GPU_Matrix_Ifc*)factors->loTriFactorPtr;
  GPU_Matrix_Ifc   *upper = (GPU_Matrix_Ifc*)factors->upTriFactorPtr;
  CUSPARRAY        *work  = (CUSPARRAY*)factors->tempvec;
  Mat_SeqAIJ       *a     = (Mat_SeqAIJ*)A->data;
  CUSPARRAY        *xGPU,*bGPU;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  /* device-side arrays of the solution and right-hand side */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* forward sweep with L, back sweep with U */
  stat = lower->solve(bGPU, work);CHKERRCUSP(stat);
  stat = upper->solve(work, xGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
470 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSECopyToGPU"
/*
   Copies an assembled SeqAIJ matrix to the GPU (in the storage format
   recorded in Mat_SeqAIJCUSPARSE::format).  Rows with no nonzeros are
   compressed away: if the CPU matrix does not already use compressed-row
   storage, compressed row indices are built here temporarily.  No-op when
   the GPU copy is already current.
*/
PetscErrorCode MatCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ      *a          = (Mat_SeqAIJ*)A->data;
  PetscInt        m           = A->rmap->n,*ii,*ridx;
  PetscErrorCode  ierr;


  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat){
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec)
          delete cusparseMat->tempvec;

      } catch(char* ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      /* count rows that have at least one nonzero (used for flop logging
         and for sizing the compressed-row index arrays) */
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++)
        cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        /* CPU matrix already stores compressed rows: reuse its arrays */
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        // Forcing compressed row on the GPU ... only relevant for CSR storage
        int k=0;
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        /* keep only nonempty rows; ridx maps compressed row -> original row */
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k] = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;
        m = cusparseMat->nonzerorow;
      }

      // Build our matrix ... first determine the GPU storage type
      cusparseMat->mat = GPU_Matrix_Factory::getNew(cusparseMat->format);

      // Create the streams and events (if desired).
      /* NOTE(review): the size of PETSC_COMM_WORLD (not the matrix's
         communicator) is passed here -- confirm this is intentional for
         deciding whether the shared body stream is used. */
      PetscMPIInt    size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      // FILL MODE UPPER is irrelevant
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      // lastly, build the matrix
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
	//      }
      if (!a->compressedrow.use) {
        // free data (only the arrays built locally above)
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* work vector sized to the number of compressed rows */
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
      //A->spptr = cusparseMat;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    A->valid_GPU_matrix = PETSC_CUSP_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
558 
559 // #undef __FUNCT__
560 // #define __FUNCT__ "MatCUSPARSECopyFromGPU"
561 // PetscErrorCode MatCUSPARSECopyFromGPU(Mat A, CUSPMATRIX *Agpu)
562 // {
563 //   Mat_SeqAIJCUSPARSE *cuspstruct = (Mat_SeqAIJCUSPARSE *) A->spptr;
564 //   Mat_SeqAIJ     *a          = (Mat_SeqAIJ *) A->data;
565 //   PetscInt        m          = A->rmap->n;
566 //   PetscErrorCode  ierr;
567 
568 //   PetscFunctionBegin;
569 //   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
570 //     if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
571 //       try {
572 //         cuspstruct->mat = Agpu;
573 //         if (a->compressedrow.use) {
574 //           //PetscInt *ii, *ridx;
575 //           SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Cannot handle row compression for GPU matrices");
576 //         } else {
577 //           PetscInt i;
578 
579 //           if (m+1 != (PetscInt) cuspstruct->mat->row_offsets.size()) {SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", cuspstruct->mat->row_offsets.size()-1, m);}
580 //           a->nz    = cuspstruct->mat->values.size();
581 //           a->maxnz = a->nz; /* Since we allocate exactly the right amount */
582 //           A->preallocated = PETSC_TRUE;
583 //           // Copy ai, aj, aa
584 //           if (a->singlemalloc) {
585 //             if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
586 //           } else {
587 //             if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
588 //             if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
589 //             if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
590 //           }
591 //           ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr);
592 //           ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
593 //           a->singlemalloc = PETSC_TRUE;
594 //           thrust::copy(cuspstruct->mat->row_offsets.begin(), cuspstruct->mat->row_offsets.end(), a->i);
595 //           thrust::copy(cuspstruct->mat->column_indices.begin(), cuspstruct->mat->column_indices.end(), a->j);
596 //           thrust::copy(cuspstruct->mat->values.begin(), cuspstruct->mat->values.end(), a->a);
597 //           // Setup row lengths
598 //           if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
599 //           ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr);
600 //           ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
601 //           for(i = 0; i < m; ++i) {
602 //             a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i];
603 //           }
604 //           // a->diag?
605 //         }
606 //         cuspstruct->tempvec = new CUSPARRAY;
607 //         cuspstruct->tempvec->resize(m);
608 //       } catch(char *ex) {
609 //         SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSP error: %s", ex);
610 //       }
611 //     }
612 //     // This assembly prevents resetting the flag to PETSC_CUSP_CPU and recopying
613 //     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
614 //     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
615 //     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
616 //   } else {
617 //     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Only valid for unallocated GPU matrices");
618 //   }
619 //   PetscFunctionReturn(0);
620 // }
621 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/*
   Creates CUSP vectors conforming to this matrix: `left` matches the row
   layout (holds A*x), `right` matches the column layout (input to A*x).
   Either output pointer may be null, in which case that vector is skipped.
*/
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;

  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
646 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   y = A*x using the CUSPARSE matrix already resident on the GPU (it is
   uploaded by MatAssemblyEnd_SeqAIJCUSPARSE, so no copy is needed here).
*/
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a      = (Mat_SeqAIJ*)A->data;
  CUSPARRAY          *xGPU,*yGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
    ierr = gpuMat->mat->multiply(xGPU, yGPU);CHKERRCUSP(ierr);
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* synchronize only when the multiply ran on the default stream */
  if (!gpuMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - gpuMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
674 
675 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   y = A^T*x (A^H*x in complex builds) using the CUSPARSE matrix already
   resident on the GPU (uploaded by MatAssemblyEnd_SeqAIJCUSPARSE).
*/
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a      = (Mat_SeqAIJ*)A->data;
  CUSPARRAY          *xGPU,*yGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
#if !defined(PETSC_USE_COMPLEX)
    ierr = gpuMat->mat->multiply(xGPU, yGPU, TRANSPOSE);CHKERRCUSP(ierr);
#else
    /* complex scalars: use the conjugate transpose */
    ierr = gpuMat->mat->multiply(xGPU, yGPU, HERMITIAN);CHKERRCUSP(ierr);
#endif
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* synchronize only when the multiply ran on the default stream */
  if (!gpuMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - gpuMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
707 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   z = y + A*x on the GPU: y is first copied into z, then the CUSPARSE
   multiply accumulates A*x into z.
*/
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray,*yarray,*zarray;
  PetscFunctionBegin;
  // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
  // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  try {
    /* start from z = y */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    // NOTE(review): yarray is acquired but never used below (zz already
    // holds the copy of yy); this Get/Restore pair looks redundant --
    // confirm it has no needed side effect before removing.
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    // multiply add: z += A*x
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
739 
#undef __FUNCT__
/* fixed: __FUNCT__ previously said "MatMultAdd_SeqAIJCUSPARSE" */
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   z = y + A^T*x (A^H*x in complex builds) on the GPU: y is first copied
   into z, then the transpose multiply accumulates into z.
*/
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray,*yarray,*zarray;
  PetscFunctionBegin;
  // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
  // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  try {
    /* start from z = y */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose.  Fixed: accumulate into zarray
       (the copy of y held by zz) -- the previous code passed yarray,
       which clobbered the input vector yy and left zz equal to yy
       (compare MatMultAdd_SeqAIJCUSPARSE, which correctly uses zarray). */
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, HERMITIAN);CHKERRCUSP(ierr);
#endif

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
775 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/* Finish assembly of the host-side AIJ structure, mirror the data to the GPU,
   and -- for a final (non-flush) assembly -- install the CUSPARSE mat-vec
   kernels as this matrix's multiply operations. */
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* point the multiply operations at the GPU implementations;
       resetting valid_GPU_matrix here is not needed since
       MatCUSPARSECopyToGPU was just called above */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
795 
796 /* --------------------------------------------------------------------------------*/
797 #undef __FUNCT__
798 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
799 /*@C
800    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
801    (the default parallel PETSc format).  For good matrix assembly performance
802    the user should preallocate the matrix storage by setting the parameter nz
803    (or the array nnz).  By setting these parameters accurately, performance
804    during matrix assembly can be increased by more than a factor of 50.
805 
806    Collective on MPI_Comm
807 
808    Input Parameters:
809 +  comm - MPI communicator, set to PETSC_COMM_SELF
810 .  m - number of rows
811 .  n - number of columns
812 .  nz - number of nonzeros per row (same for all rows)
813 -  nnz - array containing the number of nonzeros in the various rows
814          (possibly different for each row) or PETSC_NULL
815 
816    Output Parameter:
817 .  A - the matrix
818 
819    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
820    MatXXXXSetPreallocation() paradgm instead of this routine directly.
821    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
822 
823    Notes:
824    If nnz is given then nz is ignored
825 
826    The AIJ format (also called the Yale sparse matrix format or
827    compressed row storage), is fully compatible with standard Fortran 77
828    storage.  That is, the stored row and column indices can begin at
829    either one (as in Fortran) or zero.  See the users' manual for details.
830 
831    Specify the preallocated storage with either nz or nnz (not both).
832    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
833    allocation.  For large problems you MUST preallocate memory or you
834    will get TERRIBLE performance, see the users' manual chapter on matrices.
835 
836    By default, this format uses inodes (identical nodes) when possible, to
837    improve numerical efficiency of matrix-vector products and solves. We
838    search for consecutive rows with the same nonzero structure, thereby
839    reusing matrix information to achieve increased efficiency.
840 
841    Level: intermediate
842 
843 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
844 
845 @*/
846 PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
847 {
848   PetscErrorCode ierr;
849 
850   PetscFunctionBegin;
851   ierr = MatCreate(comm,A);CHKERRQ(ierr);
852   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
853   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
854   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
855   PetscFunctionReturn(0);
856 }
857 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/* Destructor for MATSEQAIJCUSPARSE: releases the GPU-side data (either the
   regular matrix container or the triangular-factor container, depending on
   A->factortype), tears down the shared cusparse handle, then defers to the
   base SeqAIJ destructor for the host-side data. */
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode        ierr;
  Mat_SeqAIJCUSPARSE      *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    // The regular matrices
    try {
      /* only delete the GPU matrix if it was ever allocated/copied to the GPU */
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
	delete (GPU_Matrix_Ifc *)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0)
	delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    // The triangular factors
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      // destroy the matrix and the container
      delete (GPU_Matrix_Ifc *)cusparseMatLo;
      delete (GPU_Matrix_Ifc *)cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  // NOTE(review): MAT_cusparseHandle is a file-global shared by all
  // aijcusparse matrices (created once in MatCreate_SeqAIJCUSPARSE), yet it
  // is destroyed here whenever ANY single matrix is destroyed.  If other
  // aijcusparse matrices are still alive, their subsequent cusparse calls
  // would use a destroyed handle -- confirm whether handle refcounting is
  // needed, or whether destruction should be deferred to PetscFinalize.
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);
    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
905 
906 
907 EXTERN_C_BEGIN
908 #undef __FUNCT__
909 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
910 PetscErrorCode  MatCreate_SeqAIJCUSPARSE(Mat B)
911 {
912   PetscErrorCode ierr;
913   GPUStorageFormat format = CSR;
914 
915   PetscFunctionBegin;
916   ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
917   if (B->factortype==MAT_FACTOR_NONE) {
918     /* you cannot check the inode.use flag here since the matrix was just created.*/
919     // now build a GPU matrix data structure
920     B->spptr        = new Mat_SeqAIJCUSPARSE;
921     ((Mat_SeqAIJCUSPARSE *)B->spptr)->mat = 0;
922     ((Mat_SeqAIJCUSPARSE *)B->spptr)->tempvec = 0;
923     ((Mat_SeqAIJCUSPARSE *)B->spptr)->format = format;
924   } else {
925     /* NEXT, set the pointers to the triangular factors */
926     B->spptr        = new Mat_SeqAIJCUSPARSETriFactors;
927     ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->loTriFactorPtr = 0;
928     ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->upTriFactorPtr = 0;
929     ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->tempvec = 0;
930     ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->format = format;
931   }
932   // Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...)
933   if (!MAT_cusparseHandle) {
934     cusparseStatus_t stat;
935     stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
936   }
937   // Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
938   // default cusparse tri solve. Note the difference with the implementation in
939   // MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu
940   ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);
941   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
942   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
943   B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
944   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
945   B->ops->setoption        = MatSetOption_SeqAIJCUSPARSE;
946   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
947   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
948   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
949   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
950   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
951   B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
952   PetscFunctionReturn(0);
953 }
954 EXTERN_C_END
955 
956