xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 63f3c55c12ae2f190c391f6df6c540efe018a44a)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_CXX_COMPLEX_FIX
3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
4 
5 #include <petscconf.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
7 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
8 
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  /* Preallocates the diagonal (b->A) and off-diagonal (b->B) sequential blocks
     as MATSEQAIJCUSPARSE matrices and propagates the GPU storage formats,
     cusparse handle, and stream recorded in the MPI-level spptr to them. */
  Mat_MPIAIJ         *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           row;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* reject negative user-supplied nonzero counts before touching the sub-matrices */
  if (d_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (d_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",row,d_nnz[row]);
    }
  }
  if (o_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (o_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",row,o_nnz[row]);
    }
  }
  if (!B->preallocated) {
    /* First call: explicitly create the two sequential MATSEQAIJCUSPARSE blocks. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatPinToCPU(b->A,B->pinnedtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatPinToCPU(b->B,B->pinnedtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* hand the MPI-level GPU configuration down to the diagonal block ... */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  /* ... and to the off-diagonal block */
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
54 
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /*
     GPU variant of the MPIAIJ mat-vec: the diagonal-block SpMV kernel is
     launched in one stream while, in a separate stream, the off-process
     entries of xx are scattered (DeviceToHost, MPI messaging, HostToDevice)
     into lvec. Once the transfer stream is synchronized, the off-diagonal
     MatMultAdd kernel runs in the original stream, which protects against
     race conditions between the two products.
  */
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nloc;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nloc);CHKERRQ(ierr);
  if (nloc != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nloc);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,xx);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);                 /* yy  = A_diag * xx */
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr); /* yy += A_offdiag * lvec */
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
83 
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  /*
     GPU variant of MatMultAdd (zz = yy + A*xx): the diagonal-block kernel is
     launched in one stream while, in a separate stream, the off-process
     entries of xx are scattered (DeviceToHost, MPI messaging, HostToDevice)
     into lvec. After the transfer stream is synchronized, the off-diagonal
     MatMultAdd kernel runs in the original stream to protect against race
     conditions between the two products.
  */
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nloc;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nloc);CHKERRQ(ierr);
  if (nloc != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nloc);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,xx);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);           /* zz  = yy + A_diag * xx */
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr); /* zz += A_offdiag * lvec */
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
112 
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* GPU transpose product: both local transpose kernels are launched first,
     then the off-diagonal contribution held in lvec is accumulated into yy
     through a reverse scatter (DeviceToHost, MPI messaging, HostToDevice),
     keeping the data transfers overlapped with the diagonal kernel.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nloc;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nloc);CHKERRQ(ierr);
  if (nloc != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nloc);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,mpiaij->lvec);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr); /* lvec = A_offdiag^T * xx */
  ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);           /* yy   = A_diag^T * xx */
  ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
141 
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  /* Records the requested cuSPARSE storage format for the diagonal block,
     the off-diagonal block, or both; the formats are pushed to the sequential
     sub-matrices at preallocation time. */
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed message: it previously listed MAT_CUSPARSE_MULT_DIAG twice and a
       non-existent MAT_CUSPARSE_MULT_ALL instead of the supported values */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
164 
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  /* Processes the -mat_cusparse_* storage-format options; the combined
     -mat_cusparse_storage_format option is queried last so it overrides the
     per-block settings when both are given. */
  MatCUSPARSEStorageFormat fmt;
  PetscErrorCode           ierr;
  PetscBool                set;
  Mat_MPIAIJ               *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,fmt);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->offdiagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,fmt);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
195 
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  /* Runs the standard MPIAIJ assembly, then on the first final assembly
     switches the local scatter vector lvec to the CUDA vector type so the
     mat-vec scatters work with device data. */
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FINAL_ASSEMBLY && !A->was_assembled) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
209 
/* Destroys an MPIAIJCUSPARSE matrix: detaches the shared cusparse handle from
   the two sequential blocks, releases the handle and (if one was created) the
   CUDA stream, frees the GPU-side struct, then runs the base MPIAIJ destroy. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    /* the blocks must stop referencing the handle before it is destroyed */
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    /* stream may be 0 (default stream), which is never explicitly destroyed */
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* base destroy frees a->A, a->B, lvec, and A->data itself */
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
233 
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  /* Type constructor: builds on MatCreate_MPIAIJ, allocates the GPU-side
     struct, creates the cusparse handle, and installs the CUSPARSE overrides
     for the matrix operations. */
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusp;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  /* CUDA vectors are the natural companion vector type for this matrix type */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a    = (Mat_MPIAIJ*)A->data;
  cusp = new Mat_MPIAIJCUSPARSE;
  a->spptr = cusp;

  cusp->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusp->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  cusp->stream              = 0; /* default CUDA stream until one is set */
  stat = cusparseCreate(&cusp->handle);CHKERRCUDA(stat);

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
267 
268 /*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.
275 
276    Collective
277 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
         (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
         (possibly different for each row) or NULL
285 
286    Output Parameter:
287 .  A - the matrix
288 
289    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
290    MatXXXXSetPreallocation() paradigm instead of this routine directly.
291    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
292 
293    Notes:
294    If nnz is given then nz is ignored
295 
296    The AIJ format (also called the Yale sparse matrix format or
297    compressed row storage), is fully compatible with standard Fortran 77
298    storage.  That is, the stored row and column indices can begin at
299    either one (as in Fortran) or zero.  See the users' manual for details.
300 
301    Specify the preallocated storage with either nz or nnz (not both).
302    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
303    allocation.  For large problems you MUST preallocate memory or you
304    will get TERRIBLE performance, see the users' manual chapter on matrices.
305 
306    By default, this format uses inodes (identical nodes) when possible, to
307    improve numerical efficiency of matrix-vector products and solves. We
308    search for consecutive rows with the same nonzero structure, thereby
309    reusing matrix information to achieve increased efficiency.
310 
311    Level: intermediate
312 
313 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
314 @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size == 1) {
    /* uniprocessor communicator: a sequential AIJCUSPARSE matrix suffices */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
333 
334 /*MC
335    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
336 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
338    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
339    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
340 
341    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
342    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
343    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
344    for communicators controlling multiple processes.  It is recommended that you call both of
345    the above preallocation routines for simplicity.
346 
347    Options Database Keys:
348 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
349 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
350 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
351 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
352 
353   Level: beginner
354 
355  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
358