xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision cc12936ac9377d1b0cd35ec09a88dcc1b6e8f816)
1 #define PETSC_SKIP_SPINLOCK
2 
3 #include <petscconf.h>
4 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
5 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
6 
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscErrorCode     ierr;
  PetscInt           row;

  PetscFunctionBegin;
  /* Finalize the row/column layouts so B->rmap->n and B->cmap->n are usable below. */
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* Validate the per-row preallocation counts for both blocks before using them. */
  if (d_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (d_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",row,d_nnz[row]);
    }
  }
  if (o_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (o_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",row,o_nnz[row]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices: the diagonal block (local
       columns) and the off-diagonal block (all global columns). */
    ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
    ierr = MatSetSizes(mpiaij->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(mpiaij->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpiaij->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
    ierr = MatSetSizes(mpiaij->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(mpiaij->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpiaij->B);CHKERRQ(ierr);
  }
  /* Preallocate each sequential block, then propagate the GPU storage format and
     the shared cuSPARSE handle/stream down to both of them. */
  ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(mpiaij->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(mpiaij->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpiaij->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpiaij->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpiaij->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpiaij->B,cusp->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
50 
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* GPU variant of the MPIAIJ matrix-vector product.  Per the original design
     notes, the diagonal-block SpMV kernel is launched in one stream while a
     separate stream performs the DeviceToHost transfer, the MPI exchange of
     off-process entries of xx, and the HostToDevice transfer back; once the
     transfer stream is synchronized the off-diagonal MatMultAdd kernel is
     launched in the original stream so the two cannot race.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nx;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nx);CHKERRQ(ierr);
  if (nx != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nx);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
79 
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence is different sequence
     than the CPU version. In particular, the diagonal block
     multiplication kernel is launched in one stream. Then,
     in a separate stream, the data transfers from DeviceToHost
     (with MPI messaging in between), then HostToDevice are
     launched. Once the data transfer stream is synchronized,
     to ensure messaging is complete, the MatMultAdd kernel
     is launched in the original (MatMult) stream to protect
     against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  /* For the transpose product xx must conform to the ROW layout of A; this
     check was previously missing (nt was fetched but never used), unlike the
     matching check against A->cmap->n in MatMult_MPIAIJCUSPARSE. */
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_REVERSE);CHKERRQ(ierr);
  /* B^T xx produces the contribution destined for other ranks (in lvec);
     A^T xx produces the purely local contribution. */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* Reverse scatter accumulates the off-process contributions into yy. */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
107 
/* Records the requested cuSPARSE storage format for the diagonal and/or
   off-diagonal block of an MPIAIJCUSPARSE matrix; the formats are applied to
   the sequential sub-matrices during preallocation. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* The previous message listed MAT_CUSPARSE_MULT_DIAG twice and a
       nonexistent MAT_CUSPARSE_MULT_ALL; name the actual supported values. */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
130 
/* Processes the -mat_cusparse_*_storage_format options database keys that
   select the GPU storage format of the diagonal and/or off-diagonal blocks.
   Formats are only selectable for unfactored matrices. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  Mat_MPIAIJ               *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;
  PetscErrorCode           ierr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* Diagonal block only. */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,fmt);CHKERRQ(ierr);
    }
    /* Off-diagonal block only. */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->offdiagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,fmt);CHKERRQ(ierr);
    }
    /* Both blocks at once. */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
161 
/* Runs the standard MPIAIJ assembly, then — on the first final assembly —
   switches the off-process work vector lvec to a CUDA sequential vector so the
   scatter in MatMult lands directly in GPU-accessible storage. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FINAL_ASSEMBLY && !A->was_assembled) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
175 
/* Tears down the CUSPARSE-specific state (handles, stream, spptr struct) and
   then delegates to the base MPIAIJ destructor. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    /* Detach the shared cuSPARSE handle from the sequential blocks before
       destroying it below. */
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* Null the pointer cached on the Mat as well; previously only the local copy
     was zeroed, leaving a->spptr dangling through MatDestroy_MPIAIJ below. */
  cusparseStruct = 0;
  a->spptr       = 0;

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
199 
/* Type constructor for MATMPIAIJCUSPARSE: builds on the plain MPIAIJ
   implementation, allocates the CUSPARSE-specific state, and overrides the
   operations that differ on the GPU.  Ordering matters: spptr must be
   populated before any overridden op (e.g. setfromoptions) can run. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE * cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  /* Route MatMPIAIJSetPreallocation() to the CUSPARSE-aware version above. */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  /* Vectors created from this matrix default to CUDA vectors. */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  /* Default both blocks to CSR storage; a shared cuSPARSE handle and stream
     are created here and handed to the sequential blocks at preallocation. */
  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);
  err = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err);

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",  MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
233 
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz (o_nz) or the arrays d_nnz (o_nnz).  By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the DIAGONAL portion of the local submatrix
          (same value is used for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           DIAGONAL portion of the local submatrix (possibly different for each row)
           or NULL
.  o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of the local
          submatrix (same value is used for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the
           OFF-DIAGONAL portion of the local submatrix (possibly different for
           each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz-style or nnz-style arguments
   (not both).  Set the nz values to PETSC_DEFAULT and the nnz arrays to NULL for
   PETSc to control dynamic memory allocation.  For large problems you MUST
   preallocate memory or you will get TERRIBLE performance, see the users' manual
   chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  /* On a single-process communicator fall back to the sequential CUSPARSE type,
     where only the "diagonal" preallocation arguments are meaningful. */
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
299 
300 /*MC
301    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
302 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
304    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
305    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
306 
307    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
308    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
309    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
310    for communicators controlling multiple processes.  It is recommended that you call both of
311    the above preallocation routines for simplicity.
312 
313    Options Database Keys:
314 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
315 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
316 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
317 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
318 
319   Level: beginner
320 
321  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
324