xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 5fd668637986a8d8518383a9159eebc368e1d5b4)
1 #include "petscconf.h"
2 PETSC_CUDA_EXTERN_C_BEGIN
3 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
4 PETSC_CUDA_EXTERN_C_END
5 #include "mpicusparsematimpl.h"
6 
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
/*
   Preallocation for the MPIAIJCUSPARSE type: validates the caller's
   nonzero estimates, lazily creates the sequential diagonal (b->A) and
   off-diagonal (b->B) blocks as MATSEQAIJCUSPARSE matrices, preallocates
   them, and pushes the currently selected GPU storage formats down onto
   both blocks.
*/
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           row;

  PetscFunctionBegin;
  /* substitute default per-row estimates when the caller did not provide any */
  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  ierr = PetscLayoutSetBlockSize(B->rmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize(B->cmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* per-row estimates, when present, must all be nonnegative */
  if (d_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (d_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",row,d_nnz[row]);
    }
  }
  if (o_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (o_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",row,o_nnz[row]);
    }
  }
  if (!B->preallocated) {
    /* first time through: explicitly create the two MATSEQAIJCUSPARSE blocks */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,b->A);CHKERRQ(ierr);
    /* the off-diagonal block is created with the full global column count */
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
54 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE"
/*
   Creates CUSP vectors compatible with this matrix: a right vector that
   conforms to the column layout (suitable as x in y = A x) and/or a left
   vector that conforms to the row layout (suitable as y).
*/
PetscErrorCode  MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* the right vector lives in the column space, so it takes the column
       layout's block size (the original code incorrectly used rmap->bs here) */
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
    /* share the matrix's column layout instead of recomputing it */
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
78 
79 
#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* GPU variant of the MPIAIJ multiply,

        yy = A_diag * xx + B_offdiag * lvec

     The ordering differs from the CPU version: the diagonal-block kernel
     is launched in one stream, while in a separate stream the scatter
     performs its DeviceToHost transfer, MPI messaging, and HostToDevice
     transfer.  VecScatterEnd() synchronizes the transfer stream, after
     which the off-diagonal MatMultAdd kernel is launched in the original
     (MatMult) stream, protecting against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  /* xx must conform to the column layout */
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
110 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* Transpose multiply, yy = A^T xx.  Unlike the forward multiply, the
     off-diagonal block's transpose product B^T xx produces contributions
     destined for OTHER processes' rows, so it is computed into lvec and
     accumulated into yy with a REVERSE scatter using ADD_VALUES, while
     the local diagonal-block transpose multiply writes yy directly.

     (The original code reused the forward-multiply pattern — forward
     scatter of xx and a cmap-sized check — which computes the wrong
     result in parallel; it also carried a __FUNCT__ string naming
     MatMult_MPIAIJCUSPARSE.)

     As in MatMult_MPIAIJCUSPARSE, the kernels run in the compute stream
     while the scatter's device<->host transfers (with MPI messaging in
     between) run in a separate stream; VecScatterEnd() synchronizes
     before the remote contributions are added.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  /* for a transpose multiply xx must conform to the ROW layout */
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  /* off-diagonal contributions for other processes */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* local part overwrites yy before the reverse scatter adds into it */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
141 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
/*
   Records the requested GPU storage format for the diagonal and/or
   off-diagonal block in the Mat_MPIAIJCUSPARSE context; the formats are
   pushed onto the sequential blocks at preallocation time.
*/
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed message: the original listed MAT_CUSPARSE_MULT_DIAG twice and
       named a nonexistent MAT_CUSPARSE_MULT_ALL */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
EXTERN_C_END
168 
169 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
/*
   Processes the -mat_cusparse_* options controlling the GPU storage
   format of the diagonal and off-diagonal blocks for SpMV.
*/
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;

  PetscFunctionBegin;
  /* PetscOptionsHead()/PetscOptionsTail() must be bracketed by the
     Begin/End pair; the original called Head before Begin and omitted
     Tail entirely */
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    /* this one overrides both blocks at once */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
201 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
/*
   Destructor: frees the CUSPARSE-specific context, then delegates to the
   base MPIAIJ destructor.
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  try {
    delete cusparseStruct;
  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* null the STORED pointer so no dangling reference survives into
     MatDestroy_MPIAIJ (the original zeroed only the local copy, a no-op) */
  a->spptr = 0;
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
220 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
/*
   Type constructor: builds on MatCreate_MPIAIJ, then installs the
   CUSPARSE-specific context, GPU-aware method-table entries, and the
   composed type-specific functions.
*/
PetscErrorCode  MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij;
  Mat_MPIAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatMPIAIJSetPreallocation_C",
                                           "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE",
                                           MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  /* attach the GPU-format context; both blocks default to CSR storage */
  aij        = (Mat_MPIAIJ*)A->data;
  aij->spptr = new Mat_MPIAIJCUSPARSE;
  cusp       = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusp->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusp->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  /* override the MPIAIJ method table with GPU-aware implementations */
  A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatCUSPARSESetFormat_C",
                                           "MatCUSPARSESetFormat_MPIAIJCUSPARSE",
                                           MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END
251 
252 /*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
256    assembly performance the user should preallocate the matrix storage by setting
257    the parameter nz (or the array nnz).  By setting these parameters accurately,
258    performance during matrix assembly can be increased by more than a factor of 50.
259    This type is only available when using the 'txpetscgpu' package. Use --download-txpetscgpu
260    to build/install PETSc to use different CUSPARSE base matrix types.
261 
262    Collective on MPI_Comm
263 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the rows of the diagonal block
           (possibly different for each row) or PETSC_NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the rows of the off-diagonal block
           (possibly different for each row) or PETSC_NULL
271 
272    Output Parameter:
273 .  A - the matrix
274 
275    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
276    MatXXXXSetPreallocation() paradigm instead of this routine directly.
277    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
278 
   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
281 
282    The AIJ format (also called the Yale sparse matrix format or
283    compressed row storage), is fully compatible with standard Fortran 77
284    storage.  That is, the stored row and column indices can begin at
285    either one (as in Fortran) or zero.  See the users' manual for details.
286 
   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=PETSC_NULL for PETSc to control dynamic memory
289    allocation.  For large problems you MUST preallocate memory or you
290    will get TERRIBLE performance, see the users' manual chapter on matrices.
291 
292    By default, this format uses inodes (identical nodes) when possible, to
293    improve numerical efficiency of matrix-vector products and solves. We
294    search for consecutive rows with the same nonzero structure, thereby
295    reusing matrix information to achieve increased efficiency.
296 
297    Level: intermediate
298 
299 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
300 @*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  /* a single-process communicator gets the sequential type directly */
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size == 1) {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
321 
322 /*M
323    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
324 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in CSR format.
326    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. Use of the
327    CUSPARSE library REQUIRES the 'txpetscgpu' package. ELL and HYB formats are also available
328    in the txpetscgpu package. Use --download-txpetscgpu to build/install PETSc to use different
329    GPU storage formats with CUSPARSE matrix types.
330 
331    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
332    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
333    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
334    for communicators controlling multiple processes.  It is recommended that you call both of
335    the above preallocation routines for simplicity.
336 
337    Options Database Keys:
338 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
339 .  -mat_cusparse_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions().
340 .  -mat_cusparse_mult_diag_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of diagonal matrix during a call to MatSetFromOptions().
341 -  -mat_cusparse_mult_offdiag_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of off-diagonal matrix during a call to MatSetFromOptions().
342 
343   Level: beginner
344 
345 .seealso: MatCreateMPIAIJ,MATSEQAIJ,MATMPIAIJ, MATMPIAIJCUSPARSE, MATSEQAIJCUSPARSE, MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
346 M*/
347