xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 047240e14af00aad1ef65e96f6fface8924f7f7e)
1 #include "petscconf.h"
2 PETSC_CUDA_EXTERN_C_BEGIN
3 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
4 PETSC_CUDA_EXTERN_C_END
5 #include "mpicusparsematimpl.h"
6 
7 #undef __FUNCT__
8 #define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
9 PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
10 {
11   Mat_MPIAIJ         *b               = (Mat_MPIAIJ*)B->data;
12   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
13   PetscErrorCode     ierr;
14   PetscInt           i;
15 
16   PetscFunctionBegin;
17   if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
18   if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
19   if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
20   if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);
21 
22   ierr = PetscLayoutSetBlockSize(B->rmap,1);CHKERRQ(ierr);
23   ierr = PetscLayoutSetBlockSize(B->cmap,1);CHKERRQ(ierr);
24   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
25   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
26   if (d_nnz) {
27     for (i=0; i<B->rmap->n; i++) {
28       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
29     }
30   }
31   if (o_nnz) {
32     for (i=0; i<B->rmap->n; i++) {
33       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
34     }
35   }
36   if (!B->preallocated) {
37     /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
38     ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
39     ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
40     ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
41     ierr = PetscLogObjectParent(B,b->A);CHKERRQ(ierr);
42     ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
43     ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
44     ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
45     ierr = PetscLogObjectParent(B,b->B);CHKERRQ(ierr);
46   }
47   ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
48   ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
49   ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
50   ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
51 
52   B->preallocated = PETSC_TRUE;
53   PetscFunctionReturn(0);
54 }
55 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE"
/*
   MatGetVecs_MPIAIJCUSPARSE - creates GPU (VECCUSP) vectors conforming to the
   layouts of the matrix, so MatMult()/MatMultTranspose() can run on the GPU.

   right - vector the matrix can be multiplied against (conforms to the column
           layout: local length cmap->n); may be PETSC_NULL
   left  - vector the result can be stored in (conforms to the row layout:
           local length rmap->n); may be PETSC_NULL
*/
PetscErrorCode  MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* NOTE(review): uses the ROW map's block size for the right (column-layout)
       vector; presumably rmap->bs == cmap->bs for AIJ — confirm, otherwise
       cmap->bs is meant */
    ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
    /* share the matrix column layout rather than building a fresh one */
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
    /* share the matrix row layout */
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
79 
80 
#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
/*
   MatMult_MPIAIJCUSPARSE - computes yy = A*xx on the GPU as
       yy = A_diag * xx  +  B_offdiag * lvec
   where lvec receives the needed ghost values of xx from other processes.
*/
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence is a different sequence
     than the CPU version. In particular, the diagonal block
     multiplication kernel is launched in one stream. Then,
     in a separate stream, the data transfers from DeviceToHost
     (with MPI messaging in between), then HostToDevice are
     launched. Once the data transfer stream is synchronized,
     to ensure messaging is complete, the MatMultAdd kernel
     is launched in the original (MatMult) stream to protect
     against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  /* the input vector must conform to the column layout of A */
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  /* set up the GPU-aware scatter so transfers overlap the diagonal multiply */
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  /* yy = A_diag * xx (kernel launch, asynchronous w.r.t. the scatter stream) */
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  /* gather ghost values of xx into lvec (D2H, MPI, H2D) */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* yy += B_offdiag * lvec */
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
111 
112 #undef __FUNCT__
113 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
114 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
115 {
116   /* This multiplication sequence is different sequence
117      than the CPU version. In particular, the diagonal block
118      multiplication kernel is launched in one stream. Then,
119      in a separate stream, the data transfers from DeviceToHost
120      (with MPI messaging in between), then HostToDevice are
121      launched. Once the data transfer stream is synchronized,
122      to ensure messaging is complete, the MatMultAdd kernel
123      is launched in the original (MatMult) stream to protect
124      against race conditions.
125 
126      This sequence should only be called for GPU computation. */
127   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
128   PetscErrorCode ierr;
129   PetscInt       nt;
130 
131   PetscFunctionBegin;
132   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
133   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
134   ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
135   ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
136   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
137   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
138   ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
139   ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
140   PetscFunctionReturn(0);
141 }
142 
143 EXTERN_C_BEGIN
144 #undef __FUNCT__
145 #define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
146 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
147 {
148   Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
149   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
150 
151   PetscFunctionBegin;
152   switch (op) {
153   case MAT_CUSPARSE_MULT_DIAG:
154     cusparseStruct->diagGPUMatFormat = format;
155     break;
156   case MAT_CUSPARSE_MULT_OFFDIAG:
157     cusparseStruct->offdiagGPUMatFormat = format;
158     break;
159   case MAT_CUSPARSE_ALL:
160     cusparseStruct->diagGPUMatFormat    = format;
161     cusparseStruct->offdiagGPUMatFormat = format;
162     break;
163   default:
164     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op);
165   }
166   PetscFunctionReturn(0);
167 }
168 EXTERN_C_END
169 
170 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
/*
   MatSetFromOptions_MPIAIJCUSPARSE - processes the -mat_cusparse_* options
   that select the GPU storage format used for SpMV by the diagonal and/or
   off-diagonal sequential blocks.  Only unfactored matrices
   (A->factortype == MAT_FACTOR_NONE) are affected.
*/
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;

  PetscFunctionBegin;
  /* NOTE(review): PetscOptionsHead is called BEFORE PetscObjectOptionsBegin;
     most implementations call Head inside the Begin/End block — confirm this
     ordering is intentional.  PetscObjectOptionsBegin is a brace-opening macro
     (matched by PetscOptionsEnd below), presumably why no CHKERRQ follows it. */
  ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* format for the diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    /* format for the off-diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    /* one format for both blocks; processed last, so it overrides the two above */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
202 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
/*
   MatDestroy_MPIAIJCUSPARSE - frees the CUSPARSE bookkeeping structure
   allocated in MatCreate_MPIAIJCUSPARSE, then delegates the remaining
   cleanup to the base MPIAIJ destroy routine.
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  try {
    delete cusparseStruct;
  } catch(char *ex) { /* NOTE(review): catches only literal char* throws, matching the style of the other CUSP/CUSPARSE destroy routines — std::exception would pass through; confirm intended */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* clears only the local copy; a->spptr itself is left pointing at freed
     memory — presumably MatDestroy_MPIAIJ never dereferences spptr; verify */
  cusparseStruct = 0;

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
222 
223 EXTERN_C_BEGIN
224 #undef __FUNCT__
225 #define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
226 PetscErrorCode  MatCreate_MPIAIJCUSPARSE(Mat A)
227 {
228   PetscErrorCode     ierr;
229   Mat_MPIAIJ         *a;
230   Mat_MPIAIJCUSPARSE * cusparseStruct;
231 
232   PetscFunctionBegin;
233   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
234   ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatMPIAIJSetPreallocation_C",
235                                            "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE",
236                                            MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
237   a        = (Mat_MPIAIJ*)A->data;
238   a->spptr = new Mat_MPIAIJCUSPARSE;
239 
240   cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
241   cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
242   cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
243 
244   A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
245   A->ops->mult           = MatMult_MPIAIJCUSPARSE;
246   A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
247   A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
248   A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
249 
250   ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
251   ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_MPIAIJCUSPARSE", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
252   PetscFunctionReturn(0);
253 }
254 EXTERN_C_END
255 
256 /*@
257    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
259    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
260    assembly performance the user should preallocate the matrix storage by setting
261    the parameter nz (or the array nnz).  By setting these parameters accurately,
262    performance during matrix assembly can be increased by more than a factor of 50.
263    This type is only available when using the 'txpetscgpu' package. Use --download-txpetscgpu
264    to build/install PETSc to use different CUSPARSE base matrix types.
265 
266    Collective on MPI_Comm
267 
268    Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           diagonal block (possibly different for each row) or PETSC_NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the
           off-diagonal block (possibly different for each row) or PETSC_NULL
275 
276    Output Parameter:
277 .  A - the matrix
278 
279    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
280    MatXXXXSetPreallocation() paradigm instead of this routine directly.
281    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
282 
283    Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
285 
286    The AIJ format (also called the Yale sparse matrix format or
287    compressed row storage), is fully compatible with standard Fortran 77
288    storage.  That is, the stored row and column indices can begin at
289    either one (as in Fortran) or zero.  See the users' manual for details.
290 
291    Specify the preallocated storage with either nz or nnz (not both).
292    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
293    allocation.  For large problems you MUST preallocate memory or you
294    will get TERRIBLE performance, see the users' manual chapter on matrices.
295 
296    By default, this format uses inodes (identical nodes) when possible, to
297    improve numerical efficiency of matrix-vector products and solves. We
298    search for consecutive rows with the same nonzero structure, thereby
299    reusing matrix information to achieve increased efficiency.
300 
301    Level: intermediate
302 
303 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
304 @*/
305 #undef __FUNCT__
306 #define __FUNCT__ "MatCreateAIJCUSPARSE"
307 PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
308 {
309   PetscErrorCode ierr;
310   PetscMPIInt    size;
311 
312   PetscFunctionBegin;
313   ierr = MatCreate(comm,A);CHKERRQ(ierr);
314   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
315   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
316   if (size > 1) {
317     ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
318     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
319   } else {
320     ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
321     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
322   }
323   PetscFunctionReturn(0);
324 }
325 
326 /*M
327    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
328 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in CSR format.
330    All matrix calculations are performed using the Nvidia CUSPARSE library. Use of the
331    CUSPARSE library REQUIRES the 'txpetscgpu' package. ELL and HYB formats are also available
332    in the txpetscgpu package. Use --download-txpetscgpu to build/install PETSc to use different
333    GPU storage formats with CUSPARSE matrix types.
334 
335    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
336    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
337    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
338    for communicators controlling multiple processes.  It is recommended that you call both of
339    the above preallocation routines for simplicity.
340 
341    Options Database Keys:
342 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
343 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
344 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
345 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
346 
347   Level: beginner
348 
.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
352