#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

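/*
   Preallocates the diagonal (b->A) and off-diagonal (b->B) blocks of the
   parallel matrix as MATSEQAIJCUSPARSE matrices, and propagates the parallel
   matrix's storage formats, cusparse handle, and CUDA stream to both blocks.
*/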
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatPinToCPU(b->A,B->pinnedtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatPinToCPU(b->B,B->pinnedtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /*
     This multiplication sequence differs from the CPU version: the
     diagonal-block multiplication kernel is launched in one stream while,
     in a separate stream, the DeviceToHost transfers, the MPI messaging,
     and the HostToDevice transfers of the scatter are performed. Once the
     data-transfer stream is synchronized, guaranteeing that messaging is
     complete, the off-diagonal MatMultAdd kernel is launched in the
     original (MatMult) stream to protect against race conditions.
  */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  /*
     Same streaming scheme as MatMult_MPIAIJCUSPARSE() above: the diagonal
     block is multiplied in one stream while the scatter traffic runs in a
     separate stream, and the off-diagonal MatMultAdd kernel is launched in
     the original stream only after the transfer stream is synchronized.
  */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /*
     Same streaming scheme as MatMult_MPIAIJCUSPARSE() above, applied to the
     transpose: both MatMultTranspose kernels are launched first, then the
     reverse scatter (DeviceToHost transfers, MPI messaging, and HostToDevice
     transfers) accumulates the off-diagonal contribution held in lvec into yy.

     This sequence should only be called for GPU computation.
  */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

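/*
   Implements MatCUSPARSESetFormat() for this type: records the requested GPU
   storage format for the diagonal and/or off-diagonal block; the formats are
   pushed onto b->A and b->B during preallocation above.
*/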
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

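/*
   After the first final assembly, retype the local work vector lvec to
   VECSEQCUDA so that the scatter used by the off-diagonal multiply moves
   data directly into GPU memory.
*/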
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *mpiaij;

  PetscFunctionBegin;
  mpiaij = (Mat_MPIAIJ*)A->data;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

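/*
   Constructor: builds on MatCreate_MPIAIJ(), switches the default vector type
   to VECCUDA, allocates a Mat_MPIAIJCUSPARSE struct holding the storage
   formats plus a dedicated cusparse handle and CUDA stream, and installs the
   GPU implementations of the mult, assembly, options, and destroy operations.
*/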
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);
  err  = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err);

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). The matrix data is ultimately pushed down
   to NVIDIA GPUs and all matrix calculations are performed with the cuSPARSE
   library. For good matrix assembly performance the user should preallocate the
   matrix storage by setting the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz).
   By setting these parameters accurately, performance during matrix assembly can
   be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block of the local submatrix (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

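   Example (a sketch; the declarations of m, n, M, N and the error checking are
   omitted), preallocating about 5 nonzeros per row in the diagonal block and 2
   in the off-diagonal block:
.vb
   Mat A;
   MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,m,n,M,N,5,NULL,2,NULL,&A);
   /* ... insert entries with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd() ... */
   MatDestroy(&A);
.ve
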
   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in
   CSR, ELL, or Hybrid (HYB) format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

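   For example, a portable construction sequence (a sketch; declarations and
   error checking omitted, with nz, d_nz, and o_nz standing for application-chosen
   preallocation values) is
.vb
   MatCreate(comm,&A);
   MatSetSizes(A,m,n,M,N);
   MatSetType(A,MATAIJCUSPARSE);
   MatSeqAIJSetPreallocation(A,nz,NULL);              /* used when comm has one process */
   MatMPIAIJSetPreallocation(A,d_nz,NULL,o_nz,NULL);  /* used when comm has multiple processes */
.ve
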
   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/