#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b               = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
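    /* b->A holds the square diagonal block (local rows by local columns);
       b->B holds the off-diagonal block, created with the full global column
       width (local rows by global columns) and compressed later at assembly. */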
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /*
     This multiplication sequence differs from the CPU version. The
     diagonal-block multiplication kernel is launched in one stream while,
     in a separate stream, the DeviceToHost transfer, the MPI messaging,
     and the HostToDevice transfer for the off-process entries take place.
     Once the transfer stream has been synchronized, which ensures the
     messaging is complete, the off-diagonal MatMultAdd kernel is launched
     in the original (MatMult) stream to protect against race conditions.
  */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
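  /* launch the diagonal-block SpMV first so it can overlap with the scatter of xx below */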
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  /*
     This multiplication sequence differs from the CPU version. The
     diagonal-block multiplication kernel is launched in one stream while,
     in a separate stream, the DeviceToHost transfer, the MPI messaging,
     and the HostToDevice transfer for the off-process entries take place.
     Once the transfer stream has been synchronized, which ensures the
     messaging is complete, the off-diagonal MatMultAdd kernel is launched
     in the original stream to protect against race conditions.
  */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
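  /* launch the diagonal-block multiply-add first so it can overlap with the scatter of xx below */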
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /*
     This multiplication sequence differs from the CPU version. The
     diagonal-block multiplication kernel is launched in one stream while,
     in a separate stream, the DeviceToHost transfer, the MPI messaging,
     and the HostToDevice transfer for the off-process entries take place.
     Once the transfer stream has been synchronized, which ensures the
     messaging is complete, the off-diagonal MatMultAdd kernel is launched
     in the original (MatMult) stream to protect against race conditions.

     This sequence should only be called for GPU computation.
  */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr);
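  /* transpose-multiply the off-diagonal block into the local work vector; its
     contributions are added into yy by the reverse scatter below */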
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *mpiaij;

  PetscFunctionBegin;
  mpiaij = (Mat_MPIAIJ*)A->data;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
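    /* switch the local scatter vector to CUDA so the off-diagonal multiply receives
       its ghost values in GPU memory */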
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

234 
235 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
236 {
237   PetscErrorCode     ierr;
238   Mat_MPIAIJ         *a;
239   Mat_MPIAIJCUSPARSE * cusparseStruct;
240   cusparseStatus_t   stat;
241 
242   PetscFunctionBegin;
243   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
244   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
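  /* vectors obtained from this matrix (e.g. with MatCreateVecs()) default to CUDA */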
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
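  /* a zero (NULL) stream means work is submitted to the default CUDA stream */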
  cusparseStruct->stream              = 0;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the DIAGONAL portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the DIAGONAL portion of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the OFF-DIAGONAL portion of the local submatrix (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT, o_nz=PETSC_DEFAULT and d_nnz=NULL, o_nnz=NULL for PETSc to
   control dynamic memory allocation.  For large problems you MUST preallocate memory
   or you will get TERRIBLE performance, see the users' manual chapter on matrices.

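   A minimal usage sketch; the global sizes M and N and the per-row nonzero counts
   below are illustrative only:
.vb
   Mat A;
   ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,M,N,5,NULL,2,NULL,&A);CHKERRQ(ierr);
   /* ... insert entries with MatSetValues(), then assemble ... */
   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
.ve
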
   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be stored in
   CSR, ELL, or HYB (hybrid) format; the ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity, as in the sketch below.

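   A minimal construction sketch that works for any communicator size; the preallocation
   counts here are illustrative only, and the preallocation call that does not match the
   final type is simply ignored:
.vb
   Mat A;
   ierr = MatCreate(comm,&A);CHKERRQ(ierr);
   ierr = MatSetSizes(A,m,n,M,N);CHKERRQ(ierr);
   ierr = MatSetType(A,MATAIJCUSPARSE);CHKERRQ(ierr);
   ierr = MatSeqAIJSetPreallocation(A,5,NULL);CHKERRQ(ierr);
   ierr = MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);CHKERRQ(ierr);
.ve
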
   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/