xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 5c8f6a953e7ed1c81f507d64aebddb11080b60e9)
1 #include "petscconf.h"
2 PETSC_CUDA_EXTERN_C_BEGIN
3 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
4 PETSC_CUDA_EXTERN_C_END
5 #include "mpicusparsematimpl.h"
6 
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
/*
   MatMPIAIJSetPreallocation_MPIAIJCUSPARSE - type-specific preallocation for MPIAIJCUSPARSE.

   Validates the requested per-row nonzero counts, sets up the row/column layouts,
   creates the two sequential blocks (diagonal and off-diagonal) as MATSEQAIJCUSPARSE
   on first use, preallocates them, and pushes the currently selected GPU storage
   formats down onto them.

   Input Parameters:
.  B      - the MPIAIJCUSPARSE matrix
.  d_nz   - nonzeros per row of the diagonal block (ignored per-row if d_nnz given)
.  d_nnz  - optional per-row nonzero counts for the diagonal block, or NULL
.  o_nz   - nonzeros per row of the off-diagonal block (ignored per-row if o_nnz given)
.  o_nnz  - optional per-row nonzero counts for the off-diagonal block, or NULL
*/
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  /* Library defaults: 5 nonzeros/row on-diagonal, 2 off-diagonal */
  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  /* AIJ has no block structure; force block size 1 and finalize the layouts */
  ierr = PetscLayoutSetBlockSize(B->rmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize(B->cmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* Reject negative per-row counts before any allocation happens */
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    /* Diagonal block: local-rows x local-columns */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,b->A);CHKERRQ(ierr);
    /* Off-diagonal block: created with the full global column count
       (MPIAIJ convention; columns are compacted later during assembly) */
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* Propagate the GPU storage format choices recorded on the MPI matrix */
  ierr=MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr=MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
54 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE"
/*
   MatGetVecs_MPIAIJCUSPARSE - creates CUSP GPU vectors conforming to the matrix layouts.

   Input Parameter:
.  mat - the MPIAIJCUSPARSE matrix

   Output Parameters:
+  right - vector the matrix can be multiplied against (column layout), or NULL
-  left  - vector the matrix result can be stored in (row layout), or NULL
*/
PetscErrorCode  MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* right vector lives in the column space: use the column block size
       (was mat->rmap->bs, which is the row layout's block size) */
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
    /* share the matrix column layout instead of rebuilding it */
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
78 
79 
#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
/*
   MatMult_MPIAIJCUSPARSE - computes yy = A*xx on the GPU.

   The ordering differs from the CPU implementation: the diagonal-block
   SpMV kernel is launched first (in one stream) while the ghost-value
   scatter (device-to-host copy, MPI exchange, host-to-device copy) runs
   in a separate stream.  Only after the scatter stream has synchronized
   is the off-diagonal MatMultAdd kernel launched in the original stream,
   which avoids a race between the two products.

   This sequence should only be called for GPU computation.
*/
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  /* sanity check: the input vector must conform to the column layout */
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  /* stage the GPU-aware scatter, then overlap: diagonal SpMV || ghost exchange */
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* ghost values are in lvec now; accumulate the off-diagonal contribution */
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
110 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
/*
   MatMultTranspose_MPIAIJCUSPARSE - computes yy = A^T*xx on the GPU.

   Mirrors MatMultTranspose_MPIAIJ: the off-diagonal block's transpose
   product is computed into lvec, the diagonal block's transpose product
   into yy, and the lvec contributions are then reverse-scattered and
   ADDed into yy.  The GPU-aware scatter hooks stage the device/host
   transfers around the MPI exchange.

   This sequence should only be called for GPU computation.

   Fixes over the previous revision:
   - __FUNCT__ said "MatMult_MPIAIJCUSPARSE" (wrong name in error traces).
   - the size check compared xx against A->cmap->n; the input of a
     transpose multiply lives in the row space, so rmap->n is correct.
   - the body used the forward-mult pattern (forward scatter of xx into
     lvec, then multtransposeadd on lvec), which is dimensionally wrong
     for A^T; replaced with the reverse-scatter/ADD_VALUES sequence used
     by the CPU implementation.
*/
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  /* off-diagonal transpose product into the ghost vector */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* diagonal transpose product into the result */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* send ghost contributions back to their owners and accumulate */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
141 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
/*
   MatCUSPARSESetFormat_MPIAIJCUSPARSE - records the requested GPU storage
   format for the diagonal and/or off-diagonal blocks.  The formats are
   stored on the Mat_MPIAIJCUSPARSE context and applied to the sequential
   blocks during preallocation.

   Input Parameters:
+  A      - the MPIAIJCUSPARSE matrix
.  op     - which block(s) the format applies to
-  format - the CUSPARSE storage format
*/
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct  = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed message: previously listed MAT_CUSPARSE_MULT_DIAG twice and the
       nonexistent MAT_CUSPARSE_MULT_ALL instead of MAT_CUSPARSE_ALL */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
EXTERN_C_END
168 
169 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
/*
   MatSetFromOptions_MPIAIJCUSPARSE - processes the -mat_cusparse_* storage
   format options for the diagonal and off-diagonal blocks.  Only applies
   to unfactored matrices.

   Fixes over the previous revision: PetscOptionsHead was invoked before
   PetscObjectOptionsBegin (it must be inside the Begin/End bracket), the
   return code of PetscObjectOptionsBegin was not checked, and the matching
   PetscOptionsTail call was missing.
*/
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
  ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
200 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
/*
   MatDestroy_MPIAIJCUSPARSE - frees the CUSPARSE-specific context, then
   delegates to the base MPIAIJ destroy routine.

   Fix over the previous revision: the dead store "cusparseStruct = 0"
   only nulled the local copy; a->spptr is now cleared so the base class
   never sees a dangling pointer.
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ *a  = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct  = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  try {
    delete cusparseStruct;
  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  a->spptr = 0;  /* clear the stored pointer, not just the local copy */
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
219 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
/*
   MatCreate_MPIAIJCUSPARSE - constructor for the "mpiaijcusparse" type.

   Builds the base MPIAIJ object, attaches a Mat_MPIAIJCUSPARSE context
   (both blocks default to CSR storage), overrides the relevant ops-table
   entries with the GPU implementations, and composes the type-specific
   preallocation and format-setting methods.
*/
PetscErrorCode  MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij;
  Mat_MPIAIJCUSPARSE *gpuctx;

  PetscFunctionBegin;
  /* start from a plain MPIAIJ matrix */
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatMPIAIJSetPreallocation_C",
                                           "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE",
                                           MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);

  /* attach the GPU-specific context; both blocks default to CSR */
  gpuctx = new Mat_MPIAIJCUSPARSE;
  gpuctx->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  gpuctx->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  mpiaij        = (Mat_MPIAIJ*)A->data;
  mpiaij->spptr = gpuctx;

  /* override the ops that have GPU implementations */
  A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_MPIAIJCUSPARSE", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END
250 
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters
   accurately, performance during matrix assembly can be increased by more than a
   factor of 50.
   This type is only available when using the 'txpetscgpu' package. Use --download-txpetscgpu
   to build/install PETSc to use different CUSPARSE base matrix types.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal portion (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           diagonal portion (possibly different for each row) or PETSC_NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the
           off-diagonal portion (possibly different for each row) or PETSC_NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (resp. o_nnz) is given then d_nz (resp. o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* pick the parallel or sequential CUSPARSE type based on the communicator size */
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
320 
321 /*M
322    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
323 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in CSR format.
325    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. Use of the
326    CUSPARSE library REQUIRES the 'txpetscgpu' package. ELL and HYB formats are also available
327    in the txpetscgpu package. Use --download-txpetscgpu to build/install PETSc to use different
328    GPU storage formats with CUSPARSE matrix types.
329 
330    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
331    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
332    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
333    for communicators controlling multiple processes.  It is recommended that you call both of
334    the above preallocation routines for simplicity.
335 
336    Options Database Keys:
337 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
338 .  -mat_cusparse_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions().
339 .  -mat_cusparse_mult_diag_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of diagonal matrix during a call to MatSetFromOptions().
340 -  -mat_cusparse_mult_offdiag_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of off-diagonal matrix during a call to MatSetFromOptions().
341 
342   Level: beginner
343 
344 .seealso: MatCreateMPIAIJ,MATSEQAIJ,MATMPIAIJ, MATMPIAIJCUSPARSE, MATSEQAIJCUSPARSE, MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
345 M*/
346