xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision fbc6a628712545ee987ca46dddc97dc451937281)
1 #define PETSC_SKIP_SPINLOCK
2 
3 #include <petscconf.h>
4 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
5 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
6 
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *aij  = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  PetscErrorCode     ierr;
  PetscInt           row;

  PetscFunctionBegin;
  /* Layouts must be set up before B->rmap->n / B->cmap->n are meaningful below. */
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* Validate user-supplied per-row nonzero counts: negative entries are meaningless. */
  if (d_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (d_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",row,d_nnz[row]);
    }
  }
  if (o_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (o_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",row,o_nnz[row]);
    }
  }
  if (!B->preallocated) {
    /* First preallocation: explicitly create the sequential diagonal (aij->A) and
       off-diagonal (aij->B) blocks as MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
    ierr = MatSetSizes(aij->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(aij->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)aij->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
    ierr = MatSetSizes(aij->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(aij->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)aij->B);CHKERRQ(ierr);
  }
  /* Preallocate both blocks, then push the configured GPU storage formats and the
     shared cuSPARSE handle/stream down into them. */
  ierr = MatSeqAIJSetPreallocation(aij->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(aij->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(aij->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(aij->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(aij->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(aij->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(aij->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(aij->B,cusp->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
50 
PetscErrorCode  MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rowbs,colbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rowbs,&colbs);CHKERRQ(ierr);
  /* Right vector: compatible with the column layout of mat, created as a CUDA vector. */
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,colbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUDA);CHKERRQ(ierr);
    ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
  }
  /* Left vector: compatible with the row layout of mat, created as a CUDA vector. */
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rowbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUDA);CHKERRQ(ierr);
    ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
76 
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* yy = A*xx, where A = [diagonal block | off-diagonal block] on this rank.
     This differs from the CPU ordering: the diagonal-block kernel runs first,
     the scatter (device-to-host copies, MPI messaging, host-to-device copies)
     fills the ghost vector lvec, and only then is the off-diagonal MatMultAdd
     kernel launched, avoiding a race on yy.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nloc;

  PetscFunctionBegin;
  PetscCheckTypeNames(xx,VECSEQCUDA,VECMPICUDA);
  PetscCheckTypeNames(yy,VECSEQCUDA,VECMPICUDA);
  ierr = VecGetLocalSize(xx,&nloc);CHKERRQ(ierr);
  if (nloc != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nloc);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
107 
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* yy = A^T * xx for the parallel matrix A = [A_diag | B_offdiag] on this rank.

     Fixes two defects in the previous implementation, which forward-scattered
     xx into lvec and then called multtransposeadd(B,lvec,yy,yy):
       1. That computes A_diag^T*xx + B_offdiag^T*(ghosted xx), which is not A^T*xx.
          The correct sequence (mirroring MatMultTranspose_MPIAIJ) is to apply
          B_offdiag^T to the local part of xx, producing contributions in lvec,
          and scatter-ADD those contributions in REVERSE mode into yy.
       2. The input of A^T lives in the row space of A, so xx must be checked
          against A->rmap->n, not A->cmap->n.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  PetscCheckTypeNames(xx,VECSEQCUDA,VECMPICUDA);
  PetscCheckTypeNames(yy,VECSEQCUDA,VECMPICUDA);
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  /* Off-diagonal transpose contributions into the local work vector ... */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* ... diagonal-block transpose product into yy ... */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* ... then accumulate the off-process contributions into yy. */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
138 
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  /* Record the requested cuSPARSE storage format for the diagonal and/or
     off-diagonal block; the formats are applied to the sequential blocks
     at preallocation time (MatMPIAIJSetPreallocation_MPIAIJCUSPARSE).

     Fix: the unsupported-operation error message previously listed
     MAT_CUSPARSE_MULT_DIAG twice, omitted MAT_CUSPARSE_MULT_OFFDIAG, and
     named a nonexistent MAT_CUSPARSE_MULT_ALL instead of MAT_CUSPARSE_ALL. */
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
161 
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  /* Query the -mat_cusparse_* storage-format options and apply them via
     MatCUSPARSESetFormat. Factored matrices have no SpMV format choices,
     so the options are only offered when A->factortype == MAT_FACTOR_NONE.

     Fix: PetscOptionsHead() opened on the caller-supplied options object was
     never closed with PetscOptionsTail(); instead a second, unrelated options
     context was opened with PetscObjectOptionsBegin (its return code unchecked)
     and closed with PetscOptionsEnd. Use the canonical Head/Tail pairing on
     the options object this callback receives. */
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
193 
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Let the base MPIAIJ assembly run first, then, on the very first final
     assembly, switch the ghost/scatter work vector lvec to a CUDA vector so
     the MatMult scatters operate on GPU data. */
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
207 
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  /* Tear down the GPU-specific state (shared cuSPARSE handle, stream, and the
     Mat_MPIAIJCUSPARSE container), then delegate to the base MPIAIJ destroy.

     Fix: the code previously nulled only the dead local pointer
     (cusparseStruct = 0) after delete, leaving a->spptr dangling at freed
     memory while MatDestroy_MPIAIJ runs; null the owning pointer instead. */
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    /* Detach the shared handle from the sequential blocks before destroying it. */
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  a->spptr = 0; /* null the owning pointer so nothing can reach the freed struct */

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
231 
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusp;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  /* Build on the plain MPIAIJ implementation, then layer the GPU-specific
     state and method overrides on top of it. */
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);

  a        = (Mat_MPIAIJ*)A->data;
  cusp     = new Mat_MPIAIJCUSPARSE;
  a->spptr = cusp;

  /* CSR is the default storage format for both blocks; one cuSPARSE handle
     and one stream are created here and later shared by both blocks. */
  cusp->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusp->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&cusp->handle);CHKERRCUDA(stat);
  err  = cudaStreamCreate(&cusp->stream);CHKERRCUDA(err);

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->getvecs        = MatCreateVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
263 
264 /*@
265    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
267    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
268    assembly performance the user should preallocate the matrix storage by setting
269    the parameter nz (or the array nnz).  By setting these parameters accurately,
270    performance during matrix assembly can be increased by more than a factor of 50.
271 
272    Collective on MPI_Comm
273 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the diagonal block for the various local rows
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the off-diagonal block for the various local rows
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix
284 
285    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
286    MatXXXXSetPreallocation() paradigm instead of this routine directly.
287    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
288 
289    Notes:
290    If nnz is given then nz is ignored
291 
292    The AIJ format (also called the Yale sparse matrix format or
293    compressed row storage), is fully compatible with standard Fortran 77
294    storage.  That is, the stored row and column indices can begin at
295    either one (as in Fortran) or zero.  See the users' manual for details.
296 
297    Specify the preallocated storage with either nz or nnz (not both).
298    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
299    allocation.  For large problems you MUST preallocate memory or you
300    will get TERRIBLE performance, see the users' manual chapter on matrices.
301 
302    By default, this format uses inodes (identical nodes) when possible, to
303    improve numerical efficiency of matrix-vector products and solves. We
304    search for consecutive rows with the same nonzero structure, thereby
305    reusing matrix information to achieve increased efficiency.
306 
307    Level: intermediate
308 
309 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
310 @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    nprocs;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&nprocs);CHKERRQ(ierr);
  /* Single-process communicators get the sequential type; otherwise the MPI
     type is used and the off-diagonal preallocation arguments take effect. */
  if (nprocs == 1) {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
329 
330 /*M
331    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
332 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
334    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
335    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
336 
337    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
338    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
339    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
340    for communicators controlling multiple processes.  It is recommended that you call both of
341    the above preallocation routines for simplicity.
342 
343    Options Database Keys:
344 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
345 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
346 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
347 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
348 
349   Level: beginner
350 
351  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
354