#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatPinToCPU(b->A,B->pinnedtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatPinToCPU(b->B,B->pinnedtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

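/*
   A minimal usage sketch (illustrative application code, not part of this
   file): create an MPIAIJCUSPARSE matrix so that preallocation is routed
   through the routine above via MatMPIAIJSetPreallocation(). The global
   size and the per-row nonzero estimates are placeholder values.

     Mat A;
     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,100,100);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);CHKERRQ(ierr);
*/
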
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /*
     This multiplication sequence differs from the CPU version. In
     particular, the diagonal block multiplication kernel is launched
     in one stream. Then, in a separate stream, the DeviceToHost
     transfers (with MPI messaging in between) and the HostToDevice
     transfers are launched. Once the data transfer stream is
     synchronized, to ensure messaging is complete, the MatMultAdd
     kernel is launched in the original (MatMult) stream to protect
     against race conditions.
  */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

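/*
   Driver-level sketch (assumed application code) of how the routine above
   is reached: MatCreateVecs() returns CUDA vectors because this type sets
   A->defaultvectype to VECCUDA, so MatMult() dispatches to the GPU path.

     Vec x,y;
     ierr = MatCreateVecs(A,&x,&y);CHKERRQ(ierr); (x: columns, y: rows)
     ierr = VecSet(x,1.0);CHKERRQ(ierr);
     ierr = MatMult(A,x,y);CHKERRQ(ierr);         (y = A x)
*/
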
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  /*
     This multiplication sequence differs from the CPU version. In
     particular, the diagonal block multiplication kernel is launched
     in one stream. Then, in a separate stream, the DeviceToHost
     transfers (with MPI messaging in between) and the HostToDevice
     transfers are launched. Once the data transfer stream is
     synchronized, to ensure messaging is complete, the MatMultAdd
     kernel is launched in the original (MatMult) stream to protect
     against race conditions.
  */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the CPU version. In
     particular, the transpose multiplication kernels for the
     off-diagonal and diagonal blocks are launched first. Then, in a
     separate stream, the DeviceToHost transfers (with MPI messaging
     in between) and the HostToDevice transfers are launched, and the
     off-process contributions are accumulated into yy with a reverse
     scatter using ADD_VALUES. Synchronizing the data transfer stream
     ensures messaging is complete and protects against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

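/*
   Companion sketch for the transpose product (assumed application code):
   the roles of the two layouts swap, so the input is row-partitioned and
   the output column-partitioned.

     Vec x,y;
     ierr = MatCreateVecs(A,&y,&x);CHKERRQ(ierr);  (y: columns, x: rows)
     ierr = MatMultTranspose(A,x,y);CHKERRQ(ierr); (y = A^T x)
*/
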
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

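/*
   Illustrative call (assumed application code): store only the
   off-diagonal block in HYB format while the diagonal block stays in CSR.

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,MAT_CUSPARSE_HYB);CHKERRQ(ierr);
*/
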
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

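/*
   The options above are exercised from the command line; an illustrative
   invocation (the program name is a placeholder):

     ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb

   or, setting the two blocks independently:

     ./app -mat_type mpiaijcusparse -mat_cusparse_mult_diag_storage_format csr
           -mat_cusparse_mult_offdiag_storage_format ell
*/
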
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *mpiaij;

  PetscFunctionBegin;
  mpiaij = (Mat_MPIAIJ*)A->data;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);
  err  = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err);

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

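/*
   An existing MPIAIJ matrix can be switched to this type at runtime; a
   hedged sketch using MatConvert() (assumed application code):

     ierr = MatConvert(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);

   Equivalently, -mat_type mpiaijcusparse together with MatSetFromOptions()
   selects the constructor above.
*/
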
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL (and likewise for the off-diagonal block) for
   PETSc to control dynamic memory allocation.  For large problems you MUST preallocate
   memory or you will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

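   Example usage (a minimal sketch; the global size and the per-row nonzero
   estimates below are illustrative placeholders):
.vb
   Mat A;
   ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,&A);CHKERRQ(ierr);
.ve
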
   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
355