xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 2f6eced2a19e978d64f27de66fbc6c26cc5c7934)
1 #define PETSC_SKIP_SPINLOCK
2 
3 #include <petscconf.h>
4 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
5 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
6 
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  /* Preallocate the diagonal (A) and off-diagonal (B) blocks of the parallel
     matrix as explicit MATSEQAIJCUSPARSE matrices, and propagate the GPU
     storage formats plus the shared cuSPARSE handle/stream to both blocks. */
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscErrorCode     ierr;
  PetscInt           row;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* Reject negative preallocation counts up front */
  if (d_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (d_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",row,d_nnz[row]);
    }
  }
  if (o_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (o_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",row,o_nnz[row]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
    ierr = MatSetSizes(mpiaij->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(mpiaij->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpiaij->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
    ierr = MatSetSizes(mpiaij->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(mpiaij->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpiaij->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* Both blocks share one cuSPARSE handle and one CUDA stream */
  ierr = MatCUSPARSESetFormat(mpiaij->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(mpiaij->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpiaij->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpiaij->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpiaij->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpiaij->B,cusp->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
50 
PetscErrorCode  MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  /* Create CUDA vectors compatible with this matrix: "right" matches the
     column layout (an input of MatMult), "left" matches the row layout
     (an output of MatMult).  Either pointer may be NULL to skip it. */
  PetscErrorCode ierr;
  PetscInt       rbs,cbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUDA);CHKERRQ(ierr);
    /* share the matrix row layout rather than recomputing it */
    ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
  }
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUDA);CHKERRQ(ierr);
    /* share the matrix column layout rather than recomputing it */
    ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
76 
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* y = A*x for the distributed matrix: the diagonal block is applied to the
     local part of x, the off-process entries of x are gathered into a->lvec by
     the VecScatter (the GPU initialize/finalize hooks bracket the data
     transfers and MPI messaging), and the off-diagonal block's contribution is
     then added to y.  This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
105 
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* y = A^T*x for the distributed matrix.

     BUG FIX: the previous version mirrored MatMult: it forward-scattered xx
     into a->lvec and then called (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy).
     That is dimensionally wrong for the transpose: the input of B^T must have
     rmap->n entries (a local piece of x), while lvec has ghost-column length.
     The correct transpose product computes B^T applied to the LOCAL part of x
     into lvec, applies A^T locally into y, and then REVERSE-scatters lvec into
     y with ADD_VALUES so remote contributions are summed.  The input-size check
     is against rmap->n (the domain of A^T), not cmap->n.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  /* Off-diagonal contribution of x's local part, destined for other ranks */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* Local (diagonal-block) contribution */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* Accumulate remote contributions into y */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
134 
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  /* Record the requested cuSPARSE storage format for the diagonal block, the
     off-diagonal block, or both; consumed later by the preallocation routine
     when it pushes the formats down to the MATSEQAIJCUSPARSE blocks. */
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* Fixed message: it previously listed MAT_CUSPARSE_MULT_DIAG twice and the
       nonexistent name MAT_CUSPARSE_MULT_ALL instead of MAT_CUSPARSE_ALL */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
157 
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  /* Process the -mat_cusparse_* options that select the GPU storage format used
     for SpMV by the diagonal block, the off-diagonal block, or both.  Formats
     only apply to non-factored matrices.
     NOTE(review): PetscOptionsHead is called before PetscObjectOptionsBegin and
     PetscObjectOptionsBegin's return value is not checked — kept as-is to
     preserve behavior; confirm against current PETSc options conventions. */
  MatCUSPARSEStorageFormat fmt;
  PetscErrorCode           ierr;
  PetscBool                set;
  Mat_MPIAIJ               *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,fmt);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->offdiagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,fmt);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
189 
190 PETSC_EXTERN PetscErrorCode MatAssemblyEnd_MPIAIJ(Mat,MatAssemblyType);
191 
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  /* Run the standard MPIAIJ assembly, then on the very first final assembly
     switch the ghost-value vector lvec to a CUDA vector so the MatMult
     scatters operate on device-capable storage. */
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FINAL_ASSEMBLY && !A->was_assembled) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
205 
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  /* Release the cuSPARSE handle, CUDA stream, and the Mat_MPIAIJCUSPARSE
     bookkeeping attached to a->spptr, then fall through to the base MPIAIJ
     destructor. */
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* BUG FIX: the original zeroed only the local copy (cusparseStruct = 0),
     leaving a->spptr dangling after the delete above; null the stored pointer
     instead so no one can reach the freed struct through the matrix. */
  a->spptr = 0;

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
229 
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  /* Construct the matrix as a plain MPIAIJ, then attach the cuSPARSE
     bookkeeping (default CSR storage formats, library handle, CUDA stream)
     and override the operations with the GPU-capable implementations. */
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusp;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);

  a        = (Mat_MPIAIJ*)A->data;
  cusp     = new Mat_MPIAIJCUSPARSE;
  a->spptr = cusp;
  cusp->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusp->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&cusp->handle);CHKERRCUDA(stat);
  err  = cudaStreamCreate(&cusp->stream);CHKERRCUDA(err);

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->getvecs        = MatCreateVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
261 
262 /*@
263    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
265    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
266    assembly performance the user should preallocate the matrix storage by setting
267    the parameter nz (or the array nnz).  By setting these parameters accurately,
268    performance during matrix assembly can be increased by more than a factor of 50.
269 
270    Collective on MPI_Comm
271 
272    Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal block
           (possibly different for each row) or NULL
279 
280    Output Parameter:
281 .  A - the matrix
282 
283    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
284    MatXXXXSetPreallocation() paradigm instead of this routine directly.
285    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
286 
287    Notes:
288    If nnz is given then nz is ignored
289 
290    The AIJ format (also called the Yale sparse matrix format or
291    compressed row storage), is fully compatible with standard Fortran 77
292    storage.  That is, the stored row and column indices can begin at
293    either one (as in Fortran) or zero.  See the users' manual for details.
294 
295    Specify the preallocated storage with either nz or nnz (not both).
296    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
297    allocation.  For large problems you MUST preallocate memory or you
298    will get TERRIBLE performance, see the users' manual chapter on matrices.
299 
300    By default, this format uses inodes (identical nodes) when possible, to
301    improve numerical efficiency of matrix-vector products and solves. We
302    search for consecutive rows with the same nonzero structure, thereby
303    reusing matrix information to achieve increased efficiency.
304 
305    Level: intermediate
306 
307 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
308 @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  /* Convenience constructor: create, size, and preallocate an AIJCUSPARSE
     matrix, choosing the sequential or parallel variant from the communicator. */
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size == 1) {
    /* uniprocess communicator: a sequential CUSPARSE matrix suffices */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
327 
328 /*M
329    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
330 
331    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
332    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
333    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
334 
335    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
336    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
337    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
338    for communicators controlling multiple processes.  It is recommended that you call both of
339    the above preallocation routines for simplicity.
340 
341    Options Database Keys:
342 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
343 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
344 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
345 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
346 
347   Level: beginner
348 
.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
352