xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision c524e44ceafee4aa1f13666ee553c16f4e41b8f4)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_CXX_COMPLEX_FIX
3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
4 
5 #include <petscconf.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
7 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
8 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
9 
/*
   MatMPIAIJSetPreallocation_MPIAIJCUSPARSE - preallocation for the MPIAIJCUSPARSE type.

   Validates the per-row nonzero counts, creates the two sequential blocks (diagonal
   block b->A and off-diagonal block b->B) as MATSEQAIJCUSPARSE the first time through,
   preallocates them, and pushes the stored CUSPARSE format/handle/stream onto both.

   Input Parameters:
+  B     - the MPI matrix being preallocated
.  d_nz  - nonzeros per row of the diagonal block (same for all rows)
.  d_nnz - per-row nonzero counts for the diagonal block, or NULL
.  o_nz  - nonzeros per row of the off-diagonal block (same for all rows)
-  o_nnz - per-row nonzero counts for the off-diagonal block, or NULL
*/
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b               = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* negative counts are user errors; report the offending local row */
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    /* the off-diagonal block has the full global column width (compacted later) */
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* the halo vector, if it already exists, must live on the GPU as well */
  if (b->lvec) {
    ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* propagate the MPI-level CUSPARSE settings to both sequential blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
58 
/*
   MatMult_MPIAIJCUSPARSE - computes yy = A*xx.

   The scatter of off-process entries of xx into the halo vector a->lvec is
   started first so the local diagonal-block multiply can overlap with the
   communication; the off-diagonal-block contribution is then added.
*/
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  /* begin communicating halo values; do NOT reorder with the local multiply below */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* yy = diag(A)*xx, overlapped with the scatter in flight */
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* yy += offdiag(A)*lvec */
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
76 
/*
   MatZeroEntries_MPIAIJCUSPARSE - zeros all entries of the matrix.

   If a split-CSR device mirror exists (spptr->deviceMat), its diagonal and
   off-diagonal value arrays are zeroed directly on the GPU with cudaMemset;
   the host-side blocks l->A and l->B are then zeroed through MatZeroEntries.

   Fix: the PetscInfo message previously read "offfdiag" (typo).
*/
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t  err;
      PetscScalar  *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      /* fetch the device-side pointer to the diagonal value array, then zero it in place */
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      /* same for the off-diagonal value array */
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
/*
   MatMultAdd_MPIAIJCUSPARSE - computes zz = A*xx + yy.

   Same communication/computation overlap pattern as MatMult: the halo scatter
   is started, the local multiply-add runs concurrently, then the off-diagonal
   contribution is accumulated into zz.
*/
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  /* begin communicating halo values of xx into a->lvec */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* zz = diag(A)*xx + yy, overlapped with the scatter */
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* zz += offdiag(A)*lvec */
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
121 
/*
   MatMultTranspose_MPIAIJCUSPARSE - computes yy = A^T*xx.

   Both transpose multiplies are local; the off-diagonal result in a->lvec is
   then accumulated into the owning processes with a reverse ADD_VALUES scatter.
*/
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr);
  /* lvec = offdiag(A)^T * xx (contributions destined for other processes) */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* yy = diag(A)^T * xx (purely local part) */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* reverse scatter accumulates the off-process contributions into yy */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
139 
/*
   MatCUSPARSESetFormat_MPIAIJCUSPARSE - records the requested CUSPARSE storage
   format for the diagonal and/or off-diagonal blocks; the formats are applied
   to the sequential blocks during preallocation.

   Fix: the unsupported-operation error message previously listed
   MAT_CUSPARSE_MULT_DIAG twice and referred to a nonexistent
   "MAT_CUSPARSE_MULT_ALL"; it now names the three operations actually handled.
*/
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
162 
/*
   MatSetFromOptions_MPIAIJCUSPARSE - processes the -mat_cusparse_* options that
   select the CUSPARSE storage format for the diagonal block, the off-diagonal
   block, or both. Options are only consulted for unfactored matrices.
*/
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* diagonal-block format only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    /* off-diagonal-block format only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    /* both blocks at once; processed last so it overrides the two options above if given together */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
193 
/*
   MatAssemblyEnd_MPIAIJCUSPARSE - finishes assembly via the MPIAIJ implementation,
   makes sure the halo vector is a CUDA vector on first assembly, and, if a device
   matrix mirror exists, uses its nonzero state to decide whether assembly happened
   on the GPU (in which case the up-to-date copy is the device one).
*/
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
  PetscInt                    nnz_state = A->nonzerostate;
  PetscFunctionBegin;
  if (d_mat) {
    cudaError_t                err;
    /* read the device-side nonzero state recorded when the device matrix was built */
    err = cudaMemcpy( &nnz_state, &d_mat->nonzerostate, sizeof(PetscInt), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
  }
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (nnz_state > A->nonzerostate) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}
216 
/*
   MatDestroy_MPIAIJCUSPARSE - releases all CUSPARSE/CUDA resources attached to
   the matrix (device split-CSR mirror, per-block handles, cusparse handle and
   stream), then defers to MatDestroy_MPIAIJ for the host-side teardown.
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* pull the struct back to the host so the device pointers it holds can be freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* the row-offset arrays were separately cudaMalloc'ed only in the compressed-row case
       (see MatCUSPARSEGetDeviceMatWrite); otherwise they alias storage owned elsewhere */
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* CUSPARSE/thrust cleanup can throw; translate any exception into a PETSc error */
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* remove the type-specific composed functions before falling through to MPIAIJ */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
258 
/*
   MatConvert_MPIAIJ_MPIAIJCUSPARSE - converts an MPIAIJ matrix to MPIAIJCUSPARSE.

   Depending on reuse, duplicates/copies/reuses B in place, switches the default
   vector type and the sequential blocks to their CUDA/CUSPARSE variants,
   allocates the Mat_MPIAIJCUSPARSE context on first use, and installs the
   CUSPARSE-specific operations and composed functions.
*/
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* for MAT_INPLACE_MATRIX, *newmat is B itself */
  A = *newmat;

  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  /* retype the sequential blocks and the halo vector, if they exist yet */
  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  /* create the CUSPARSE context once; defaults to CSR format, default stream */
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
309 
/*
   MatCreate_MPIAIJCUSPARSE - type constructor: verifies CUDA is initialized,
   builds a plain MPIAIJ matrix, then converts it in place to MPIAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
320 
321 /*@
322    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
324    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
325    assembly performance the user should preallocate the matrix storage by setting
326    the parameter nz (or the array nnz).  By setting these parameters accurately,
327    performance during matrix assembly can be increased by more than a factor of 50.
328 
329    Collective
330 
331    Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE)
.  n - number of local columns (or PETSC_DECIDE)
.  M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL
338 
339    Output Parameter:
340 .  A - the matrix
341 
342    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
343    MatXXXXSetPreallocation() paradigm instead of this routine directly.
344    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
345 
346    Notes:
347    If nnz is given then nz is ignored
348 
349    The AIJ format (also called the Yale sparse matrix format or
350    compressed row storage), is fully compatible with standard Fortran 77
351    storage.  That is, the stored row and column indices can begin at
352    either one (as in Fortran) or zero.  See the users' manual for details.
353 
354    Specify the preallocated storage with either nz or nnz (not both).
355    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
356    allocation.  For large problems you MUST preallocate memory or you
357    will get TERRIBLE performance, see the users' manual chapter on matrices.
358 
359    By default, this format uses inodes (identical nodes) when possible, to
360    improve numerical efficiency of matrix-vector products and solves. We
361    search for consecutive rows with the same nonzero structure, thereby
362    reusing matrix information to achieve increased efficiency.
363 
364    Level: intermediate
365 
366 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
367 @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* pick the sequential or parallel CUSPARSE variant based on the communicator size */
  if (size == 1) {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
386 
387 /*MC
388    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
389 
390    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
391    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
392    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
393 
394    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
395    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
396    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
397    for communicators controlling multiple processes.  It is recommended that you call both of
398    the above preallocation routines for simplicity.
399 
400    Options Database Keys:
401 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
402 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
403 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
404 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
405 
406   Level: beginner
407 
408  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
411 
// get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
// On first call, builds a device-resident PetscSplitCSRDataStructure that mirrors
// the CSR pointers of the diagonal (and, in parallel, off-diagonal) blocks so that
// device kernels can set values directly; subsequent calls return the cached pointer.
// Side effect: marks the matrix as not assembled (caller is expected to write values
// on the device and trigger assembly afterwards).
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      /* sequential: only a "diagonal" block; off-diagonal pointers stay NULL */
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      /* parallel: grab raw device CSR pointers for both the diag and offdiag blocks */
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    /* factored matrices have no split-CSR device form */
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
  }
  if (!*p_d_mat) {
    cudaError_t                 err;
    PetscSplitCSRDataStructure  *d_mat, h_mat;
    Mat_SeqAIJ                  *jaca;
    PetscInt                    n = A->rmap->n, nnz;
    // create and copy: build h_mat on the host, then copy the whole struct to d_mat
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.nonzerostate = A->nonzerostate;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
      h_mat.seq = PETSC_TRUE;
    } else {
      Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ  *jacb;
      h_mat.seq = PETSC_FALSE; // for MatAssemblyEnd_SeqAIJCUSPARSE
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      h_mat.nonzerostate = aij->A->nonzerostate; // just keep one nonzero state?
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazy) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      // sentinel in the extra slot to detect accidental overwrite by the loop below
      aij->colmap[A->cmap->N] = -9;
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      // compressed-row storage: device needs a private full-length row-offset array
      // (freed in MatDestroy_MPIAIJCUSPARSE); otherwise reuse the CSR pointer bi
      if (jacb->compressedrow.use) {
        err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}
551