xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 27aa7340ca9f5aec230511b1b22ffadb73880b07)
1 #include "petscconf.h"
2 PETSC_CUDA_EXTERN_C_BEGIN
3 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
4 PETSC_CUDA_EXTERN_C_END
5 #include "mpicusparsematimpl.h"
6 
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
/*
   Preallocation routine installed for MATMPIAIJCUSPARSE: validates the
   requested per-row nonzero counts, creates (on first use) the diagonal
   and off-diagonal sequential blocks as MATSEQAIJCUSPARSE matrices,
   preallocates them, and pushes the stored GPU storage formats down to
   each block.

   Input:
     B              - the MPI matrix being preallocated
     d_nz,  d_nnz[] - nonzeros per row of the diagonal block (scalar or per-row array)
     o_nz,  o_nnz[] - nonzeros per row of the off-diagonal block (scalar or per-row array)
*/
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  /* fall back to modest per-row estimates when the caller gave no hint */
  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  /* finalize the row/column layouts so B->rmap->n etc. are valid below */
  ierr = PetscLayoutSetBlockSize(B->rmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize(B->cmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* reject negative entries in the optional per-row count arrays */
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    /* b->A: the local n x n diagonal block */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,b->A);CHKERRQ(ierr);
    /* b->B: the local-rows x global-columns off-diagonal block */
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* propagate the GPU storage formats chosen on the MPI matrix to the blocks */
  ierr=MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr=MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
54 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE"
/*
   Creates CUSP (GPU) work vectors conformal to the matrix layouts: the
   right vector matches the column layout (an input of MatMult) and the
   left vector matches the row layout (an output of MatMult).  Either
   output pointer may be PETSC_NULL if that vector is not wanted.
*/
PetscErrorCode  MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* the right vector lives in the column space, so it must take the
       COLUMN block size; the original code incorrectly used mat->rmap->bs */
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
    /* share the matrix column layout instead of building a fresh one */
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
    /* share the matrix row layout */
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
78 
79 
#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
/* GPU-aware parallel matrix-vector product yy = A*xx. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This differs from the CPU sequence: the diagonal-block multiply
     kernel is launched in one stream, while a second stream performs the
     DeviceToHost transfer, the MPI exchange of ghost values, and the
     HostToDevice transfer back.  Synchronizing the transfer stream
     (inside VecScatterEnd) guarantees the messaging has completed before
     the off-diagonal MatMultAdd kernel is queued in the original stream,
     so the two partial products cannot race.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *aij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(aij->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  /* diagonal-block product in the compute stream */
  ierr = (*aij->A->ops->mult)(aij->A,xx,yy);CHKERRQ(ierr);
  /* exchange ghost values in the transfer stream */
  ierr = VecScatterBegin(aij->Mvctx,xx,aij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(aij->Mvctx,xx,aij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* accumulate the off-diagonal contribution */
  ierr = (*aij->B->ops->multadd)(aij->B,aij->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(aij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
110 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
/* GPU-aware parallel transpose product yy = A^T * xx.
   (The original __FUNCT__ string wrongly said "MatMult_MPIAIJCUSPARSE".) */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* For the transpose the data flow is the REVERSE of MatMult: the
     off-diagonal block transpose product is computed first into the
     local work vector, those ghost contributions are scattered in
     SCATTER_REVERSE mode with ADD_VALUES toward their owning processes
     while the diagonal-block transpose product is computed locally into
     yy, and the remote contributions are accumulated when the scatter
     completes.  (The original code incorrectly reused the forward-mult
     sequence: it forward-scattered xx and applied multtransposeadd to
     the ghost vector, which is dimensionally wrong for a transpose.)

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  /* for A^T the input vector must conform to the ROW layout of A */
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  /* off-diagonal part into the local work vector */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* send the ghost contributions on their way */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  /* overlap communication with the local (diagonal block) product */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* accumulate the remote contributions */
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
141 
142 /*PetscErrorCode MatSetValuesBatch_MPIAIJCUSPARSE(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats); */
143 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
/*
   Records the requested CUSPARSE storage format for the diagonal block,
   the off-diagonal block, or both; the stored formats are applied to the
   sequential blocks at preallocation time.
*/
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed message: the original listed MAT_CUSPARSE_MULT_DIAG twice and a
       nonexistent MAT_CUSPARSE_MULT_ALL instead of the operations handled above */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
EXTERN_C_END
170 
171 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
/*
   Processes the -mat_cusparse_*_storage_format options database keys,
   forwarding any selected format to MatCUSPARSESetFormat().
*/
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;

  PetscFunctionBegin;
  /* the object options block must be opened before PetscOptionsHead; the
     original code called them in the opposite order and ignored the
     return code of PetscObjectOptionsBegin */
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
  ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* seed each query with the currently stored format so an absent option
       leaves the setting unchanged (format was previously read uninitialized) */
    format = cusparseStruct->diagGPUMatFormat;
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    format = cusparseStruct->offdiagGPUMatFormat;
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    format = cusparseStruct->diagGPUMatFormat;
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  /* PetscOptionsTail pairs with PetscOptionsHead (was missing) */
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
202 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
/*
   Destroys the MPIAIJCUSPARSE matrix: frees the GPU bookkeeping structure,
   then delegates the remaining teardown to MatDestroy_MPIAIJ().
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  try {
    delete cusparseStruct;
  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* clear the stored pointer so MatDestroy_MPIAIJ cannot see a dangling
     spptr (the original zeroed only the local copy of the pointer) */
  a->spptr = 0;
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
221 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
/*
   Type constructor for MATMPIAIJCUSPARSE: builds a plain MPIAIJ matrix,
   then overlays the CUSPARSE bookkeeping structure and the GPU-aware
   operation table.
*/
PetscErrorCode  MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij;
  Mat_MPIAIJCUSPARSE *gpuStruct;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatMPIAIJSetPreallocation_C",
                                           "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE",
                                           MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  /* attach the GPU bookkeeping structure; both blocks default to CSR */
  gpuStruct = new Mat_MPIAIJCUSPARSE;
  gpuStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  gpuStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  aij        = (Mat_MPIAIJ*)A->data;
  aij->spptr = gpuStruct;
  /* override the operations that must be GPU-aware */
  A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatCUSPARSESetFormat_C",
                                           "MatCUSPARSESetFormat_MPIAIJCUSPARSE",
                                           MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END
252 
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz (or the array d_nnz) and o_nz (or the array o_nnz).  By setting
   these parameters accurately, performance during matrix assembly can be increased
   by more than a factor of 50.
   This type is only available when using the 'txpetscgpu' package. Use --download-txpetscgpu
   to build/install PETSc to use different CUSPARSE base matrix types.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
         (possibly different for each row) or PETSC_NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
         (possibly different for each row) or PETSC_NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* choose the MPI or sequential CUSPARSE type based on the communicator size */
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
322 
323 /*M
324    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
325 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in CSR format.
327    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. Use of the
328    CUSPARSE library REQUIRES the 'txpetscgpu' package. ELL and HYB formats are also available
329    in the txpetscgpu package. Use --download-txpetscgpu to build/install PETSc to use different
330    GPU storage formats with CUSPARSE matrix types.
331 
332    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
333    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
334    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
335    for communicators controlling multiple processes.  It is recommended that you call both of
336    the above preallocation routines for simplicity.
337 
338    Options Database Keys:
339 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
340 .  -mat_cusparse_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions().
341 .  -mat_cusparse_mult_diag_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of diagonal matrix during a call to MatSetFromOptions().
342 -  -mat_cusparse_mult_offdiag_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of off-diagonal matrix during a call to MatSetFromOptions().
343 
344   Level: beginner
345 
346 .seealso: MatCreateMPIAIJ,MATSEQAIJ,MATMPIAIJ, MATMPIAIJCUSPARSE, MATSEQAIJCUSPARSE, MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
347 M*/
348