xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 5b6bfdb9644f185dbf5e5a09b808ec241507e1e7)
1 #define PETSC_SKIP_SPINLOCK
2 
3 #include <petscconf.h>
4 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
5 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
6 
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscErrorCode     ierr;
  PetscInt           row;

  PetscFunctionBegin;
  /* Row/column layouts must be finalized before the sequential blocks can be sized. */
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);

  /* Validate the per-row preallocation counts, when supplied. */
  if (d_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (d_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",row,d_nnz[row]);
    }
  }
  if (o_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (o_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",row,o_nnz[row]);
    }
  }

  if (!B->preallocated) {
    /* First preallocation: explicitly create the diagonal (A) and off-diagonal (B)
       blocks as sequential CUSPARSE matrices owned by this parallel matrix. */
    ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
    ierr = MatSetSizes(mpiaij->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(mpiaij->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpiaij->A);CHKERRQ(ierr);

    ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
    ierr = MatSetSizes(mpiaij->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(mpiaij->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpiaij->B);CHKERRQ(ierr);
  }

  /* Preallocate both blocks and propagate the GPU storage format, cuSPARSE
     handle, and CUDA stream shared by this matrix. */
  ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(mpiaij->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(mpiaij->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpiaij->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpiaij->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpiaij->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpiaij->B,cusp->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
50 
PetscErrorCode  MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rowbs,colbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rowbs,&colbs);CHKERRQ(ierr);
  if (right) {
    /* The right-hand vector is compatible with the column layout of mat. */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,colbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUDA);CHKERRQ(ierr);
    ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
  }
  if (left) {
    /* The left-hand vector is compatible with the row layout of mat. */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rowbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUDA);CHKERRQ(ierr);
    ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
76 
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* GPU-side parallel MatMult, sequenced differently from the CPU version:
     the diagonal-block kernel is launched in the MatMult stream while the
     off-process entries of xx are gathered (device-to-host transfer, MPI
     messaging, host-to-device transfer) in a separate stream.  Once the
     transfer stream is synchronized — guaranteeing the messaging finished —
     the off-diagonal MatMultAdd kernel is launched back in the original
     stream, protecting against races between transfer and add.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);                               /* yy  = A_diag * xx      */
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);               /* yy += A_offdiag * lvec */
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
105 
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* GPU-side parallel transpose multiply, sequenced differently from the CPU
     version: both transpose kernels are launched on the device, then the
     off-diagonal contributions in lvec are scattered back (device-to-host,
     MPI messaging, host-to-device) and accumulated into yy in reverse mode.
     The transfer stream is synchronized before the accumulation to protect
     against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,xx,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr);    /* lvec = A_offdiag^T * xx */
  ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);              /* yy   = A_diag^T * xx    */
  ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
134 
/* Record the requested cuSPARSE storage format for the diagonal block, the
   off-diagonal block, or both, on the Mat_MPIAIJCUSPARSE side data.  The
   formats are applied to the sequential blocks at preallocation time. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* Fixed error message: it previously listed MAT_CUSPARSE_MULT_DIAG twice
       and named a nonexistent MAT_CUSPARSE_MULT_ALL instead of the actual
       MAT_CUSPARSE_MULT_OFFDIAG and MAT_CUSPARSE_ALL values. */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
157 
/* Process -mat_cusparse_* options controlling the GPU storage format of the
   diagonal and off-diagonal blocks.  Called from MatSetFromOptions(). */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  /* This routine runs inside an options context already opened by the caller,
     so query against the supplied PetscOptionsObject (Head/Tail pair).  The
     previous code additionally opened a second context with
     PetscObjectOptionsBegin/PetscOptionsEnd (without error checking), which
     nested the contexts, processed each option in the wrong scope, and left
     the PetscOptionsHead unbalanced by a PetscOptionsTail. */
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
189 
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  /* On the first final assembly, switch the local scatter work vector to the
     CUDA vector type so the off-diagonal multiply consumes device data. */
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
203 
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  cudaError_t        cerr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  /* Tear down the GPU-side state (handles, stream, side data) before the
     host matrix itself is destroyed. */
  try {
    ierr = MatCUSPARSEClearHandle(mpiaij->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(mpiaij->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusp->handle);CHKERRCUDA(stat);
    cerr = cudaStreamDestroy(cusp->stream);CHKERRCUDA(cerr);
    delete cusp;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
227 
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij;
  Mat_MPIAIJCUSPARSE *cusp;
  cudaError_t        cerr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  /* Build the host-side MPIAIJ matrix first, then layer the CUSPARSE state
     and method overrides on top of it. */
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);

  mpiaij        = (Mat_MPIAIJ*)A->data;
  cusp          = new Mat_MPIAIJCUSPARSE;
  mpiaij->spptr = cusp;

  /* Default both blocks to CSR; a single cuSPARSE handle and CUDA stream are
     shared by the diagonal and off-diagonal blocks. */
  cusp->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusp->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&cusp->handle);CHKERRCUDA(stat);
  cerr = cudaStreamCreate(&cusp->stream);CHKERRCUDA(cerr);

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->getvecs        = MatCreateVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
259 
260 /*@
261    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
263    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
264    assembly performance the user should preallocate the matrix storage by setting
265    the parameter nz (or the array nnz).  By setting these parameters accurately,
266    performance during matrix assembly can be increased by more than a factor of 50.
267 
268    Collective on MPI_Comm
269 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the diagonal block of the various
           local rows (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the off-diagonal block of the
           various local rows (possibly different for each row) or NULL
277 
278    Output Parameter:
279 .  A - the matrix
280 
281    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
282    MatXXXXSetPreallocation() paradigm instead of this routine directly.
283    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
284 
285    Notes:
286    If nnz is given then nz is ignored
287 
288    The AIJ format (also called the Yale sparse matrix format or
289    compressed row storage), is fully compatible with standard Fortran 77
290    storage.  That is, the stored row and column indices can begin at
291    either one (as in Fortran) or zero.  See the users' manual for details.
292 
   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic
   memory allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.
297 
298    By default, this format uses inodes (identical nodes) when possible, to
299    improve numerical efficiency of matrix-vector products and solves. We
300    search for consecutive rows with the same nonzero structure, thereby
301    reusing matrix information to achieve increased efficiency.
302 
303    Level: intermediate
304 
305 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
306 @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    nproc;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&nproc);CHKERRQ(ierr);
  if (nproc > 1) {
    /* Parallel communicator: MPI type with separate diagonal/off-diagonal preallocation. */
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    /* Single process: sequential type; only the diagonal-block counts apply. */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
325 
326 /*MC
327    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
328 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
330    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
331    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
332 
333    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
334    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
335    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
336    for communicators controlling multiple processes.  It is recommended that you call both of
337    the above preallocation routines for simplicity.
338 
339    Options Database Keys:
340 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
341 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
342 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
343 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
344 
345   Level: beginner
346 
347  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
350