#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatPinToCPU(b->A,B->pinnedtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatPinToCPU(b->B,B->pinnedtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

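/* A minimal usage sketch (hypothetical caller code, not part of this file):
   size and type the matrix, then preallocate 5 nonzeros per row in the
   diagonal block and 2 per row in the off-diagonal block; m and n are the
   local sizes.

     Mat A;
     MatCreate(PETSC_COMM_WORLD,&A);
     MatSetSizes(A,m,n,PETSC_DETERMINE,PETSC_DETERMINE);
     MatSetType(A,MATMPIAIJCUSPARSE);
     MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);
*/
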
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /*
     This multiplication sequence differs from the one used in the CPU
     version. In particular, the diagonal-block multiplication kernel is
     launched in one stream. Then, in a separate stream, the DeviceToHost
     data transfers (with MPI messaging in between) and the HostToDevice
     transfers are launched. Once the data-transfer stream is synchronized,
     to ensure the messaging is complete, the MatMultAdd kernel for the
     off-diagonal block is launched in the original (MatMult) stream to
     protect against race conditions.
  */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

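/* A schematic of the overlap described above (an illustrative sketch only;
   the stream handles, buffers, and kernel names are hypothetical, not the
   actual implementation):

     spmv_diag<<<grid,block,0,matStream>>>(...);            // yy  = A_diag * xx
     cudaMemcpyAsync(hsend,dsend,nb,cudaMemcpyDeviceToHost,xferStream);
     // ... MPI communication of the ghost values on the host ...
     cudaMemcpyAsync(dlvec,hrecv,mb,cudaMemcpyHostToDevice,xferStream);
     cudaStreamSynchronize(xferStream);                     // messaging complete
     spmv_offdiag_add<<<grid,block,0,matStream>>>(...);     // yy += A_offdiag * lvec
*/
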
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  /*
     This multiplication sequence differs from the one used in the CPU
     version. In particular, the diagonal-block multiplication kernel is
     launched in one stream. Then, in a separate stream, the DeviceToHost
     data transfers (with MPI messaging in between) and the HostToDevice
     transfers are launched. Once the data-transfer stream is synchronized,
     to ensure the messaging is complete, the MatMultAdd kernel for the
     off-diagonal block is launched in the original (MatMult) stream to
     protect against race conditions.
  */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the one used in the CPU
     version. The transpose kernels for the off-diagonal and diagonal
     blocks are launched first; the off-diagonal contribution in lvec is
     then accumulated into yy by a reverse scatter, which carries out the
     DeviceToHost transfer, the MPI messaging, and the HostToDevice
     transfer of the result.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

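/* A usage sketch (hypothetical caller code): store the diagonal block in ELL
   format while leaving the off-diagonal block in the default CSR format.

     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_ELL);
*/
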
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *mpiaij;

  PetscFunctionBegin;
  mpiaij = (Mat_MPIAIJ*)A->data;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    /* the local work vector lvec holds the scattered ghost values; it must live on the GPU so MatMult can consume it on the device */
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);
  err  = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err);

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row of the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row of the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

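   For example, a minimal sketch of that paradigm (m and n are the local sizes;
   the preallocation counts shown are illustrative):
.vb
   MatCreate(comm,&A);
   MatSetSizes(A,m,n,PETSC_DETERMINE,PETSC_DETERMINE);
   MatSetType(A,MATAIJCUSPARSE);
   MatSeqAIJSetPreallocation(A,5,NULL);
   MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);
.ve
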
   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance; see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). See the example below.

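   For example, one might select the hybrid format for both blocks from the
   command line (a sketch; ./ex1 stands for any hypothetical executable linked
   against PETSc):
.vb
   mpiexec -n 2 ./ex1 -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb
.ve
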
  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
357