xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision c0174eb74783e1ccfa817272a5782dd72b4cfcef)
1 /*
2     Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format.
4 */
5 
6 #include "petscconf.h"
7 PETSC_CUDA_EXTERN_C_BEGIN
8 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
9 //#include "petscbt.h"
10 #include "../src/vec/vec/impls/dvecimpl.h"
11 #include "petsc-private/vecimpl.h"
12 PETSC_CUDA_EXTERN_C_END
13 #undef VecType
14 #include "cusparsematimpl.h"
15 
16 // this is such a hack ... but I don't know of another way to pass this variable
17 // from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
18 //  SpMV. Essentially, I need to use the same stream variable in two different
19 //  data structures. I do this by creating a single instance of that stream
20 //  and reuse it.
21 cudaStream_t theBodyStream=0;
22 
23 EXTERN_C_BEGIN
24 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
26 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *);
27 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
30 PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat);
31 PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
32 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
33 PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
34 PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
35 EXTERN_C_END
36 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/*
   Reports the solver package of a CUSPARSE factor matrix.  Composed on the
   factor matrix (see MatGetFactor_seqaij_cusparse() below) so that
   MatFactorGetSolverPackage() returns MATSOLVERCUSPARSE for it.
*/
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
EXTERN_C_END
47 
48 EXTERN_C_BEGIN
49 extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
50 EXTERN_C_END
51 
52 
53 
54 /*MC
55   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices
56   on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp
  ./configure --download-txpetscgpu to install PETSc to use CUSPARSE solves

  Options Database Keys:
+ -mat_mult_cusparse_storage_format <CSR>         - (choose one of) CSR, ELL, HYB
- -mat_solve_cusparse_storage_format <CSR>        - (choose one of) CSR, ELL, HYB
63 
64    Level: beginner
65 M*/
66 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Creates the factor matrix used by the CUSPARSE solver package.  Starts
   from the ordinary CPU AIJ factor matrix, converts it to the
   SEQAIJCUSPARSE type, and installs the CUSPARSE symbolic-factorization
   entry points.  Only LU-family factorizations are supported.
*/
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* build the standard PETSc AIJ factor matrix first, then retype it */
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT){
    ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    /* pick up -mat_solve_cusparse_storage_format etc. for the factor matrix */
    ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
EXTERN_C_END
87 
#undef __FUNCT__
#define __FUNCT__ "MatAIJCUSPARSESetGPUStorageFormatForMatSolve"
/*
   Sets the GPU storage format used for the triangular solves of a factored
   SEQAIJCUSPARSE matrix.  If GPU factor data already exists it is destroyed
   and rebuilt in the requested format.

   Fixes over the previous version: the deleted factor pointers are cleared
   in A->spptr so nothing dangles between the delete and the rebuild, and a
   rebuild is triggered when EITHER factor exists (previously both had to
   exist, leaking/ignoring a half-built state).
*/
PetscErrorCode MatAIJCUSPARSESetGPUStorageFormatForMatSolve(Mat A,GPUStorageFormat format)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc* cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  PetscFunctionBegin;
  cusparseTriFactors->format = format;
  if (cusparseMatLo || cusparseMatUp) {
    // If data exists already, delete it and clear the stale pointers
    if (cusparseMatLo) {
      delete cusparseMatLo;
      cusparseTriFactors->loTriFactorPtr = 0;
    }
    if (cusparseMatUp) {
      delete cusparseMatUp;
      cusparseTriFactors->upTriFactorPtr = 0;
    }

    // key data was deleted so reset the flag.
    A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

    // rebuild the factors in the new storage format
    ierr=MatCUSPARSEAnalysisAndCopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
113 
//EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetOption_SeqAIJCUSPARSE"
/*
   Handles MatSetOption() for SEQAIJCUSPARSE matrices.  Delegates to the
   plain AIJ handler first, then maps the GPU-storage-format options
   (CSR/ELL/HYB, plus their diag/offdiag-block variants) onto the format
   field of the GPU data structure.  DIA storage is rejected; all other
   options fall through to the AIJ behavior.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool  flg)
{
  Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode ierr;
  PetscFunctionBegin;
  /* let the CPU AIJ implementation process the option first */
  ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  switch (op) {
  case MAT_DIAGBLOCK_CSR:
  case MAT_OFFDIAGBLOCK_CSR:
  case MAT_CSR:
    cusparseMat->format = CSR;
    break;
  case MAT_DIAGBLOCK_DIA:
  case MAT_OFFDIAGBLOCK_DIA:
  case MAT_DIA:
    /* SETERRQ returns from the function, so no break is needed here */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported GPU matrix storage format DIA for (MPI,SEQ)AIJCUSPARSE matrix type.");
  case MAT_DIAGBLOCK_ELL:
  case MAT_OFFDIAGBLOCK_ELL:
  case MAT_ELL:
    cusparseMat->format = ELL;
    break;
  case MAT_DIAGBLOCK_HYB:
  case MAT_OFFDIAGBLOCK_HYB:
  case MAT_HYB:
    cusparseMat->format = HYB;
    break;
  default:
    /* not a storage-format option; nothing more to do */
    break;
  }
  PetscFunctionReturn(0);
}
//EXTERN_C_END
149 
//EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes the runtime options controlling the GPU storage format:
   -mat_mult_cusparse_storage_format for SpMV (unfactored matrices) and
   -mat_solve_cusparse_storage_format for triangular solves (factor
   matrices).

   Bug fix: PetscOptionsEList() with 3 list entries returns an index in
   [0,2], but the old switch used case labels 0/2/3 — selecting ELL
   (idx==1) left 'format' uninitialized (undefined behavior) and HYB was
   unreachable.  The mapping is now 0->CSR, 1->ELL, 2->HYB and 'format'
   is default-initialized to MAT_CSR.
*/
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscInt       idx=0;
  char * formats[] ={CSR,ELL,HYB};
  MatOption      format=MAT_CSR;   /* default storage format is CSR */
  PetscBool      flg;
  PetscFunctionBegin;
  ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"When using TxPETSCGPU, AIJCUSPARSE Options","Mat");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEList("-mat_mult_cusparse_storage_format",
			     "Set the storage format of (seq)aijcusparse gpu matrices for SpMV",
			     "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr);
    /* map the list index onto the corresponding MatOption */
    switch (idx) {
    case 0:
      format=MAT_CSR;
      break;
    case 1:
      format=MAT_ELL;
      break;
    case 2:
      format=MAT_HYB;
      break;
    }
    ierr=MatSetOption_SeqAIJCUSPARSE(A,format,PETSC_TRUE);CHKERRQ(ierr);
  }
  else {
    ierr = PetscOptionsEList("-mat_solve_cusparse_storage_format",
			     "Set the storage format of (seq)aijcusparse gpu matrices for TriSolve",
			     "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr);
    ierr=MatAIJCUSPARSESetGPUStorageFormatForMatSolve(A,formats[idx]);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
//EXTERN_C_END
191 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic ILU factorization: performed entirely on the CPU by the AIJ
   implementation; only the numeric stage is redirected to the CUSPARSE
   version so the triangular factors end up on the GPU.
*/
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
203 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic LU factorization: performed on the CPU by the AIJ
   implementation; the numeric stage is redirected to the CUSPARSE version.
*/
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
215 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEBuildLowerTriMatrix"
/*
   Builds the lower-triangular factor L on the GPU from the host-side
   factored AIJ storage.  L is stored with an explicit unit diagonal
   (a 1.0 is appended at the end of each row).  The CSR arrays are staged
   in pinned host memory (cudaMallocHost), copied into a newly created
   GPU matrix object, analyzed for triangular solve, and then freed.
   No-op unless the GPU copy is missing or stale.
*/
PetscErrorCode MatCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *ai = a->i,*aj = a->j,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscErrorCode     ierr;
  PetscInt *AiLo, *AjLo;
  PetscScalar *AALo;
  PetscInt i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit diagonals + the strictly-lower entries of rows 1..n-1 (row 0 of L
         is just its unit diagonal; the ai[] offsets here index the factored
         storage — NOTE(review): assumes the usual PETSc factored-AIJ layout) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host memory for fast transfer inside setMatrix() below */
      ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]=(PetscInt) 0;
      AiLo[n]=nzLower;
      /* row 0 of L holds only the unit diagonal */
      AjLo[0]=(PetscInt) 0;
      AALo[0]=(MatScalar) 1.0;
      v    = aa;
      vi   = aj;
      offset=1;
      rowOffset=1;
      for (i=1; i<n; i++) {
	nz  = ai[i+1] - ai[i];
	// additional 1 for the term on the diagonal
	AiLo[i]=rowOffset;
	rowOffset+=nz+1;

	/* copy the strictly-lower entries of row i, then append the unit diagonal */
	ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
	ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

	offset+=nz;
	AjLo[offset]=(PetscInt) i;
	AALo[offset]=(MatScalar) 1.0;
	offset+=1;

	v  += nz;
	vi += nz;
      }
      /* create the GPU object in the requested storage format and run the
         cusparse triangular-solve analysis once, up front */
      cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;
      // free temporary data
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
284 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEBuildUpperTriMatrix"
/*
   Builds the upper-triangular factor U on the GPU from the host-side
   factored AIJ storage.  Rows are assembled back-to-front using the
   a->diag offsets; the stored diagonal entry is inverted (1./v[nz]) so
   the GPU solve can multiply instead of divide.  CSR arrays are staged
   in pinned host memory and freed after the copy/analysis.  No-op unless
   the GPU copy is missing or stale.
*/
PetscErrorCode MatCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscInt *AiUp, *AjUp;
  PetscScalar *AAUp;
  PetscInt i,nz, nzUpper, offset;
  PetscErrorCode     ierr;

  PetscFunctionBegin;

  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag[] decreases here; the U entries live between adiag[i+1] and adiag[i] */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--){
	/* start of row i's off-diagonal U entries in the factored storage */
	v   = aa + adiag[i+1] + 1;
	vi  = aj + adiag[i+1] + 1;

	// number of elements NOT on the diagonal
	nz = adiag[i] - adiag[i+1]-1;

	// decrement the offset
	offset -= (nz+1);

	// first, set the diagonal elements
	AjUp[offset] = (PetscInt) i;
	/* store the reciprocal of the diagonal (stored inverted for the solve) */
	AAUp[offset] = 1./v[nz];
	AiUp[i] = AiUp[i+1] - (nz+1);

	ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
	ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* create the GPU object and run the triangular-solve analysis once */
      cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;
      // free temporary data
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
350 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEAnalysisAndCopyToGPU"  /* fixed typo: was "MatCUSPARSEAnalysiAndCopyToGPU" */
/*
   Builds both triangular factors on the GPU, allocates the work vector
   used by the MatSolve routines, and hands the row/column permutations
   to the GPU factor objects when they are not the identity.
*/
PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS               isrow = a->row,iscol = a->icol;
  PetscBool        row_identity,col_identity;
  const PetscInt   *r,*c;
  PetscInt          n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);
  /* intermediate vector shared by the two solves in MatSolve */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  //lower triangular indices: only transferred when the row ordering is nontrivial
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity)
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  //upper triangular indices: only transferred when the column ordering is nontrivial
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity)
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
385 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric LU factorization: the factorization itself runs on the CPU via
   the AIJ implementation; afterwards the triangular factors are copied to
   the GPU and the appropriate MatSolve variant (with or without
   reordering) is selected based on the row/column permutations.
*/
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode   ierr;
  Mat_SeqAIJ       *b=(Mat_SeqAIJ *)B->data;
  IS               isrow = b->row,iscol = b->col;
  PetscBool        row_identity,col_identity;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  // determine which version of MatSolve needs to be used.
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
  else                              B->ops->solve = MatSolve_SeqAIJCUSPARSE;

  // get the triangular factors onto the GPU
  ierr = MatCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
410 
411 
412 
413 #undef __FUNCT__
414 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
415 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
416 {
417   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
418   PetscErrorCode ierr;
419   CUSPARRAY      *xGPU, *bGPU;
420   cusparseStatus_t stat;
421   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
422   GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
423   GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
424   CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;
425 
426   PetscFunctionBegin;
427   // Get the GPU pointers
428   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
429   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
430 
431   // solve with reordering
432   ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
433   stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
434   stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
435   ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);
436 
437   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
438   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
439   ierr = WaitForGPU();CHKERRCUSP(ierr);
440   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
441   PetscFunctionReturn(0);
442 }
443 
444 
445 
446 
447 #undef __FUNCT__
448 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
449 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
450 {
451   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
452   PetscErrorCode    ierr;
453   CUSPARRAY         *xGPU, *bGPU;
454   cusparseStatus_t stat;
455   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
456   GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
457   GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
458   CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;
459 
460   PetscFunctionBegin;
461   // Get the GPU pointers
462   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
463   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
464 
465   // solve
466   stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
467   stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
468 
469   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
470   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
471   ierr = WaitForGPU();CHKERRCUSP(ierr);
472   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
473   PetscFunctionReturn(0);
474 }
475 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSECopyToGPU"
/*
   Copies the AIJ matrix to the GPU in the currently selected storage
   format, deleting any previous GPU copy first.  Rows with no nonzeros
   are compressed away (compressed-row form is forced even when the CPU
   copy does not use it), and the compressed->global row index map is
   handed to the GPU object for the SpMV kernels.  No-op unless the GPU
   copy is missing or stale.
*/
PetscErrorCode MatCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ      *a          = (Mat_SeqAIJ*)A->data;
  PetscInt        m           = A->rmap->n,*ii,*ridx;
  PetscErrorCode  ierr;


  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat){
      try {
	delete cusparseMat->mat;
	if (cusparseMat->tempvec)
	  delete cusparseMat->tempvec;

      } catch(char* ex) {
	SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      /* count rows that contain at least one nonzero */
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++)
	cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
	/* the CPU copy is already compressed; reuse its index arrays */
	m    = a->compressedrow.i;
	ii   = a->compressedrow.i;
	ridx = a->compressedrow.rindex;
      } else {
	// Forcing compressed row on the GPU ... only relevant for CSR storage
	int k=0;
	ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
	ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
	ii[0]=0;
	for (int j = 0; j<m; j++) {
	  if ((a->i[j+1]-a->i[j])>0) {
	    ii[k] = a->i[j];
	    ridx[k]= j;
	    k++;
	  }
	}
	ii[cusparseMat->nonzerorow] = a->nz;
	m = cusparseMat->nonzerorow;
      }

      // Build our matrix ... first determine the GPU storage type
      cusparseMat->mat = GPU_Matrix_Factory::getNew(cusparseMat->format);

      // Create the streams and events (if desired).
      /* NOTE(review): uses PETSC_COMM_WORLD, not the matrix communicator —
         the global size decides whether the shared theBodyStream is used */
      PetscMPIInt    size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      // FILL MODE UPPER is irrelevant
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      // lastly, build the matrix
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      if (!a->compressedrow.use) {
	// free the temporary compressed-row arrays built above
	ierr = PetscFree(ii);CHKERRQ(ierr);
	ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* work vector sized to the compressed row count */
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    A->valid_GPU_matrix = PETSC_CUSP_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
563 
564 // #undef __FUNCT__
565 // #define __FUNCT__ "MatCUSPARSECopyFromGPU"
566 // PetscErrorCode MatCUSPARSECopyFromGPU(Mat A, CUSPMATRIX *Agpu)
567 // {
568 //   Mat_SeqAIJCUSPARSE *cuspstruct = (Mat_SeqAIJCUSPARSE *) A->spptr;
569 //   Mat_SeqAIJ     *a          = (Mat_SeqAIJ *) A->data;
570 //   PetscInt        m          = A->rmap->n;
571 //   PetscErrorCode  ierr;
572 
573 //   PetscFunctionBegin;
574 //   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
575 //     if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
576 //       try {
577 //         cuspstruct->mat = Agpu;
578 //         if (a->compressedrow.use) {
579 //           //PetscInt *ii, *ridx;
580 //           SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Cannot handle row compression for GPU matrices");
581 //         } else {
582 //           PetscInt i;
583 
584 //           if (m+1 != (PetscInt) cuspstruct->mat->row_offsets.size()) {SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", cuspstruct->mat->row_offsets.size()-1, m);}
585 //           a->nz    = cuspstruct->mat->values.size();
586 //           a->maxnz = a->nz; /* Since we allocate exactly the right amount */
587 //           A->preallocated = PETSC_TRUE;
588 //           // Copy ai, aj, aa
589 //           if (a->singlemalloc) {
590 //             if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
591 //           } else {
592 //             if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
593 //             if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
594 //             if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
595 //           }
596 //           ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr);
597 //           ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
598 //           a->singlemalloc = PETSC_TRUE;
599 //           thrust::copy(cuspstruct->mat->row_offsets.begin(), cuspstruct->mat->row_offsets.end(), a->i);
600 //           thrust::copy(cuspstruct->mat->column_indices.begin(), cuspstruct->mat->column_indices.end(), a->j);
601 //           thrust::copy(cuspstruct->mat->values.begin(), cuspstruct->mat->values.end(), a->a);
602 //           // Setup row lengths
603 //           if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
604 //           ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr);
605 //           ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
606 //           for(i = 0; i < m; ++i) {
607 //             a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i];
608 //           }
609 //           // a->diag?
610 //         }
611 //         cuspstruct->tempvec = new CUSPARRAY;
612 //         cuspstruct->tempvec->resize(m);
613 //       } catch(char *ex) {
614 //         SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSP error: %s", ex);
615 //       }
616 //     }
617 //     // This assembly prevents resetting the flag to PETSC_CUSP_CPU and recopying
618 //     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
619 //     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
620 //     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
621 //   } else {
622 //     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Only valid for unallocated GPU matrices");
623 //   }
624 //   PetscFunctionReturn(0);
625 // }
626 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/*
   Creates CUSP-type work vectors compatible with this matrix: the right
   vector has the column layout, the left vector the row layout.  Either
   output pointer may be PETSC_NULL.
*/
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;

  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* NOTE(review): uses the row block size for the right (column-layout)
       vector — looks like it should be mat->cmap->bs; confirm intent */
    ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
651 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   GPU sparse matrix-vector product yy = A*xx.  The matrix is assumed to
   already be on the GPU (copied in MatAssemblyEnd_SeqAIJCUSPARSE).
*/
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY          *xGPU,*yGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  // The GPU copy happens in MatAssemblyEnd_SeqAIJCUSPARSE, so no
  // MatCUSPARSECopyToGPU(A) call is needed here.
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
    ierr = gpuMat->mat->multiply(xGPU, yGPU);CHKERRCUSP(ierr);
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  // only block when the multiply ran on the default stream
  if (!gpuMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*aij->nz - gpuMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
679 
680 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   GPU transpose product yy = A^T*xx (A^H*xx for complex scalars).  The
   matrix is assumed to already be on the GPU (copied in
   MatAssemblyEnd_SeqAIJCUSPARSE).
*/
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY          *xGPU,*yGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  // The GPU copy happens in MatAssemblyEnd_SeqAIJCUSPARSE, so no
  // MatCUSPARSECopyToGPU(A) call is needed here.
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
#if !defined(PETSC_USE_COMPLEX)
    ierr = gpuMat->mat->multiply(xGPU, yGPU, TRANSPOSE);CHKERRCUSP(ierr);
#else
    /* complex scalars use the conjugate transpose */
    ierr = gpuMat->mat->multiply(xGPU, yGPU, HERMITIAN);CHKERRCUSP(ierr);
#endif
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  // only block when the multiply ran on the default stream
  if (!gpuMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*aij->nz - gpuMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
712 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   GPU product-with-add zz = yy + A*xx: copies yy into zz, then
   accumulates A*xx into zz on the GPU.
*/
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray,*yarray,*zarray;
  PetscFunctionBegin;
  // The GPU copy happens in MatAssemblyEnd_SeqAIJCUSPARSE, so no
  // MatCUSPARSECopyToGPU(A) call is needed here.
  try {
    /* zz <- yy, then accumulate the product into zz */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    /* NOTE(review): yarray is acquired but never used below — the copy
       above already handles yy; candidate for removal */
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    // multiply add
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
744 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"  /* fixed: previously claimed MatMultAdd_SeqAIJCUSPARSE */
/*
   GPU transpose product-with-add zz = yy + A^T*xx (A^H*xx for complex):
   copies yy into zz, then accumulates the transpose product into zz.

   Bug fix: the accumulation previously targeted yarray (the read-only
   array of yy) instead of zarray, so zz never received the product and
   yy was clobbered; it now targets zarray, matching
   MatMultAdd_SeqAIJCUSPARSE.  The unused acquisition of yy's array was
   removed along with the fix.
*/
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray,*zarray;
  PetscFunctionBegin;
  // The GPU copy happens in MatAssemblyEnd_SeqAIJCUSPARSE, so no
  // MatCUSPARSECopyToGPU(A) call is needed here.
  try {
    /* zz <- yy, then accumulate the transpose product into zz */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    // multiply add with matrix transpose
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, HERMITIAN);CHKERRCUSP(ierr);
#endif

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
780 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
   Completes assembly of an aijcusparse matrix: finishes the host-side AIJ
   assembly, mirrors the matrix onto the GPU, and (for a final assembly only)
   points the matrix-vector operations at the GPU implementations.
*/
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* copy here (rather than lazily in the mult routines) so the GPU copy is
     always valid once assembly completes */
  ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* install the GPU mat-vec kernels on final assembly only */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
800 
801 /* --------------------------------------------------------------------------------*/
802 #undef __FUNCT__
803 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
804 /*@C
805    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
806    (the default parallel PETSc format).  For good matrix assembly performance
807    the user should preallocate the matrix storage by setting the parameter nz
808    (or the array nnz).  By setting these parameters accurately, performance
809    during matrix assembly can be increased by more than a factor of 50.
810 
811    Collective on MPI_Comm
812 
813    Input Parameters:
814 +  comm - MPI communicator, set to PETSC_COMM_SELF
815 .  m - number of rows
816 .  n - number of columns
817 .  nz - number of nonzeros per row (same for all rows)
818 -  nnz - array containing the number of nonzeros in the various rows
819          (possibly different for each row) or PETSC_NULL
820 
821    Output Parameter:
822 .  A - the matrix
823 
824    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
826    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
827 
828    Notes:
829    If nnz is given then nz is ignored
830 
831    The AIJ format (also called the Yale sparse matrix format or
832    compressed row storage), is fully compatible with standard Fortran 77
833    storage.  That is, the stored row and column indices can begin at
834    either one (as in Fortran) or zero.  See the users' manual for details.
835 
836    Specify the preallocated storage with either nz or nnz (not both).
837    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
838    allocation.  For large problems you MUST preallocate memory or you
839    will get TERRIBLE performance, see the users' manual chapter on matrices.
840 
841    By default, this format uses inodes (identical nodes) when possible, to
842    improve numerical efficiency of matrix-vector products and solves. We
843    search for consecutive rows with the same nonzero structure, thereby
844    reusing matrix information to achieve increased efficiency.
845 
846    Level: intermediate
847 
848 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
849 
850 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes are passed identically */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* the cast drops const to match the preallocation routine's signature */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
862 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   Frees the GPU-side data hanging off A->spptr, then falls through to the
   host-side SeqAIJ destructor.  Which structure spptr points to depends on
   whether A is a factored matrix (see MatCreate_SeqAIJCUSPARSE).
*/
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode        ierr;
  Mat_SeqAIJCUSPARSE      *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    // The regular matrices
    try {
      /* the GPU matrix object only exists once a copy-to-GPU has happened */
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
	delete (GPU_Matrix_Ifc *)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0)
	delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    // The triangular factors
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      // destroy the matrix and the container
      delete (GPU_Matrix_Ifc *)cusparseMatLo;
      delete (GPU_Matrix_Ifc *)cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /* NOTE(review): MAT_cusparseHandle is created once and shared by every
     aijcusparse matrix (see MatCreate_SeqAIJCUSPARSE); destroying it here
     tears it down even if other such matrices are still alive -- confirm
     this is intended */
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);
    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
910 
911 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   Type constructor for MATSEQAIJCUSPARSE: builds the underlying SeqAIJ
   matrix, attaches the GPU-side container to B->spptr (a plain matrix
   container, or a triangular-factor container for factored matrices),
   lazily creates the shared cusparse handle, and installs the GPU
   implementations of the matrix operations.
*/
PetscErrorCode  MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  GPUStorageFormat format = CSR;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created. */
    /* build the GPU matrix container and attach it to spptr */
    Mat_SeqAIJCUSPARSE *cusparsestruct = new Mat_SeqAIJCUSPARSE;
    cusparsestruct->mat     = 0;
    cusparsestruct->tempvec = 0;
    cusparsestruct->format  = format;
    B->spptr = cusparsestruct;
  } else {
    /* factored matrix: spptr holds the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *triFactors = new Mat_SeqAIJCUSPARSETriFactors;
    triFactors->loTriFactorPtr = 0;
    triFactors->upTriFactorPtr = 0;
    triFactors->tempvec        = 0;
    triFactors->format         = format;
    B->spptr = triFactors;
  }
  // Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...)
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  // Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
  // default cusparse tri solve. Note the difference with the implementation in
  // MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->setoption        = MatSetOption_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
  PetscFunctionReturn(0);
}
EXTERN_C_END
960 
961