#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
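    /* The off-diagonal block b->B is also sequential, but is sized with the
       full global column space (B->cmap->N) since it gathers remote columns */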
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the CPU version. The
     diagonal-block multiplication kernel is launched in one stream; in a
     separate stream, the DeviceToHost transfers, the MPI messaging, and the
     HostToDevice transfers are launched. Once the data-transfer stream is
     synchronized, ensuring that the messaging is complete, the MatMultAdd
     kernel is launched in the original (MatMult) stream to protect against
     race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
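  /* Diagonal-block SpMV: yy = A_diag*xx, launched on the GPU stream */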
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
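  /* Off-diagonal contribution once the halo values have arrived in lvec:
     yy += B_offdiag*lvec */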
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* As in MatMult_MPIAIJCUSPARSE, this sequence differs from the CPU
     version: the transpose multiplications with the off-diagonal and
     diagonal blocks are launched on the GPU, and once the messaging is
     complete the reverse scatter accumulates the off-process contributions
     held in lvec into yy.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_REVERSE);CHKERRQ(ierr);
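  /* Transpose SpMV with the off-diagonal block first; the contributions
     destined for other processes accumulate in the work vector lvec */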
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
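  /* The reverse scatter adds each lvec contribution into the owning
     process's entries of yy */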
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *mpiaij;

  PetscFunctionBegin;
  mpiaij = (Mat_MPIAIJ*)A->data;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
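    /* Place the halo work vector on the GPU so that the scatters can move
       data directly to and from device memory */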
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  a->spptr = NULL;

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
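  /* One cuSPARSE handle and one stream are created here and shared by both
     sequential blocks; see MatMPIAIJSetPreallocation_MPIAIJCUSPARSE */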
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);
  err  = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err);

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). The matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT, o_nz=PETSC_DEFAULT, and d_nnz=o_nnz=NULL for PETSc to
   control dynamic memory allocation.  For large problems you MUST preallocate
   memory or you will get TERRIBLE performance, see the users' manual chapter on
   matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

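   Example:
   A minimal usage sketch (the sizes and fill estimates here are illustrative,
   and error checking via ierr/CHKERRQ is abbreviated):

     Mat A;
     MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,&A);
     ... insert entries with MatSetValues(), then assemble ...
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
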
   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in
   CSR, ELL, or HYB format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

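   For example, assuming the application calls MatSetFromOptions(), a run that
   selects this type and the HYB storage format from the command line might
   look like (the executable name here is hypothetical):

     mpiexec -n 4 ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb
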
  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/