xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 665c2ded495bb9782a7454dcfef3abf1536c3670)
1 #include "petscconf.h"
2 PETSC_CUDA_EXTERN_C_BEGIN
3 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
4 PETSC_CUDA_EXTERN_C_END
5 #include "mpicusparsematimpl.h"
6 
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
/*
   Preallocates the diagonal (b->A) and off-diagonal (b->B) blocks of an
   MPIAIJCUSPARSE matrix.  On the first call the two blocks are created
   explicitly as MATSEQAIJCUSPARSE matrices; the GPU storage formats
   recorded on the parallel matrix are then pushed down to both blocks.
*/
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           row;

  PetscFunctionBegin;
  /* Substitute default per-row counts when the caller did not choose any. */
  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  ierr = PetscLayoutSetBlockSize(B->rmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize(B->cmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);

  /* Per-row counts, when supplied, must all be non-negative. */
  if (d_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (d_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",row,d_nnz[row]);
    }
  }
  if (o_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (o_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",row,o_nnz[row]);
    }
  }

  if (!B->preallocated) {
    /* First call: explicitly create the two sequential GPU blocks.  The
       diagonal block is local-rows x local-cols; the off-diagonal block is
       stored with the full global column space (compacted later). */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,b->B);CHKERRQ(ierr);
  }

  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* Propagate the GPU storage formats chosen on the parallel matrix. */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
55 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE"
/*
   Creates GPU (VECCUSP) vectors compatible with this matrix.  A "right"
   vector lives in the column space (an input of MatMult); a "left" vector
   lives in the row space (an output of MatMult).  Either pointer may be
   NULL, in which case that vector is not created.
*/
PetscErrorCode  MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* BUG FIX: a right vector is partitioned like the COLUMNS, so it must
       take the column block size; the original used mat->rmap->bs, which is
       wrong whenever the row and column block sizes differ. */
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
    ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
    ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
81 
82 
#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
/*
   y = A x, GPU path only.  The sequence differs from the CPU MPIAIJ
   version: the diagonal-block SpMV kernel launches in one stream while a
   second stream performs the DeviceToHost transfer, the MPI exchange of
   ghost values, and the HostToDevice transfer back.  Once the transfer
   stream has been synchronized (guaranteeing messaging completed), the
   off-diagonal MatMultAdd kernel launches in the original (MatMult)
   stream, protecting against race conditions.
*/
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nlocal;

  PetscFunctionBegin;
  /* The input vector must be partitioned like the matrix columns. */
  ierr = VecGetLocalSize(xx,&nlocal);CHKERRQ(ierr);
  if (nlocal != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nlocal);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  /* Diagonal-block product overlaps with the ghost-value exchange below. */
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* Accumulate the off-diagonal contribution from the gathered ghosts. */
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
113 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
/*
   y = A^T x, GPU path only.  Follows the same pattern as
   MatMultTranspose_MPIAIJ: the off-diagonal block's transpose product is
   formed in the compact work vector a->lvec, the diagonal block's
   transpose product goes directly into yy, and the remote partial sums in
   lvec are then scatter-ADDed back into yy with a REVERSE scatter.

   BUG FIXES versus the original:
   - __FUNCT__ said "MatMult_MPIAIJCUSPARSE" (copy-paste error), corrupting
     error tracebacks.
   - The size check compared against A->cmap->n, but the input of a
     transpose multiply is partitioned like the ROWS (A->rmap->n).
   - The communication sequence was the forward-mult one: it scattered xx
     forward into lvec and then applied B's multtransposeadd to lvec, whose
     length matches B's column space, not its row space — a size-mismatched
     and semantically wrong sequence for a transpose product.
*/
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  /* Off-diagonal contribution into the compact work vector ... */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* ... diagonal-block contribution directly into the result ... */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* ... then add the remote partial sums into yy. */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
144 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
/*
   Records the requested GPU storage format for the diagonal and/or
   off-diagonal block of an MPIAIJCUSPARSE matrix.  The stored formats are
   pushed down to the two sequential blocks at preallocation time.
*/
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* BUG FIX: the original message listed MAT_CUSPARSE_MULT_DIAG twice and
       named a non-existent MAT_CUSPARSE_MULT_ALL; corrected to the actual
       supported enum values. */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
EXTERN_C_END
171 
172 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
/*
   Processes the -mat_cusparse_* options database keys that select the GPU
   storage format (CSR/ELL/HYB) of the diagonal and off-diagonal blocks.
   Format options only apply to unfactored matrices.
*/
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;

  PetscFunctionBegin;
  /* BUG FIX: PetscOptionsHead() must be called inside the options block,
     i.e. after PetscObjectOptionsBegin(); the original called it first.
     (PetscObjectOptionsBegin is conventionally used without CHKERRQ.) */
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
204 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
/*
   Destroys an MPIAIJCUSPARSE matrix: releases the GPU-format bookkeeping
   structure, then delegates the rest of the teardown to MatDestroy_MPIAIJ.
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  try {
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* BUG FIX: null the owning pointer, not just the local copy — the
     original's `cusparseStruct = 0;` was dead code and left a->spptr
     dangling into MatDestroy_MPIAIJ. */
  a->spptr = 0;

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
224 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
/*
   Type constructor for MATMPIAIJCUSPARSE: builds the matrix as a plain
   MPIAIJ first, attaches the GPU-format bookkeeping structure, and then
   overrides the operations for which CUSPARSE implementations exist.
*/
PetscErrorCode  MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij;
  Mat_MPIAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatMPIAIJSetPreallocation_C",
                                           "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE",
                                           MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);

  /* Both blocks default to CSR storage on the GPU. */
  cusp                      = new Mat_MPIAIJCUSPARSE;
  cusp->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusp->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;

  mpiaij        = (Mat_MPIAIJ*)A->data;
  mpiaij->spptr = cusp;

  A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_MPIAIJCUSPARSE", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END
257 
258 /*@
259    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
261    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
262    assembly performance the user should preallocate the matrix storage by setting
263    the parameter nz (or the array nnz).  By setting these parameters accurately,
264    performance during matrix assembly can be increased by more than a factor of 50.
265    This type is only available when using the 'txpetscgpu' package. Use --download-txpetscgpu
266    to build/install PETSc to use different CUSPARSE base matrix types.
267 
268    Collective on MPI_Comm
269 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal block
           (possibly different for each row) or NULL
277 
278    Output Parameter:
279 .  A - the matrix
280 
281    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
282    MatXXXXSetPreallocation() paradigm instead of this routine directly.
283    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
284 
285    Notes:
286    If nnz is given then nz is ignored
287 
288    The AIJ format (also called the Yale sparse matrix format or
289    compressed row storage), is fully compatible with standard Fortran 77
290    storage.  That is, the stored row and column indices can begin at
291    either one (as in Fortran) or zero.  See the users' manual for details.
292 
293    Specify the preallocated storage with either nz or nnz (not both).
294    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
295    allocation.  For large problems you MUST preallocate memory or you
296    will get TERRIBLE performance, see the users' manual chapter on matrices.
297 
298    By default, this format uses inodes (identical nodes) when possible, to
299    improve numerical efficiency of matrix-vector products and solves. We
300    search for consecutive rows with the same nonzero structure, thereby
301    reusing matrix information to achieve increased efficiency.
302 
303    Level: intermediate
304 
305 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
306 @*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
/*
   Convenience constructor: creates the matrix, sets its sizes, and picks
   MATSEQAIJCUSPARSE or MATMPIAIJCUSPARSE (with the matching preallocation
   routine) based on the size of the communicator.
*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    nproc;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&nproc);CHKERRQ(ierr);
  if (nproc == 1) {
    /* Uniprocess communicator: the sequential CUSPARSE type suffices. */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
327 
328 /*M
329    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
330 
331    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in CSR format.
332    All matrix calculations are performed using the Nvidia CUSPARSE library. Use of the
333    CUSPARSE library REQUIRES the 'txpetscgpu' package. ELL and HYB formats are also available
334    in the txpetscgpu package. Use --download-txpetscgpu to build/install PETSc to use different
335    GPU storage formats with CUSPARSE matrix types.
336 
337    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
338    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
339    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
340    for communicators controlling multiple processes.  It is recommended that you call both of
341    the above preallocation routines for simplicity.
342 
343    Options Database Keys:
344 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
345 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
346 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
347 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
348 
349   Level: beginner
350 
.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
354