xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 00de8ff0695ff394d09a2c60082aeaab5870b6e2)
1 #include "petscconf.h"
2 PETSC_CUDA_EXTERN_C_BEGIN
3 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
4 PETSC_CUDA_EXTERN_C_END
5 #include "mpicusparsematimpl.h"
6 
7 #undef __FUNCT__
8 #define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
9 PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
10 {
11   Mat_MPIAIJ         *b               = (Mat_MPIAIJ*)B->data;
12   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
13   PetscErrorCode     ierr;
14   PetscInt           i;
15 
16   PetscFunctionBegin;
17   if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
18   if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
19   if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
20   if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);
21 
22   ierr = PetscLayoutSetBlockSize(B->rmap,1);CHKERRQ(ierr);
23   ierr = PetscLayoutSetBlockSize(B->cmap,1);CHKERRQ(ierr);
24   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
25   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
26   if (d_nnz) {
27     for (i=0; i<B->rmap->n; i++) {
28       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
29     }
30   }
31   if (o_nnz) {
32     for (i=0; i<B->rmap->n; i++) {
33       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
34     }
35   }
36   if (!B->preallocated) {
37     /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
38     ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
39     ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
40     ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
41     ierr = PetscLogObjectParent(B,b->A);CHKERRQ(ierr);
42     ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
43     ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
44     ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
45     ierr = PetscLogObjectParent(B,b->B);CHKERRQ(ierr);
46   }
47   ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
48   ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
49   ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
50   ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
51 
52   B->preallocated = PETSC_TRUE;
53   PetscFunctionReturn(0);
54 }
55 
56 #undef __FUNCT__
57 #define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE"
58 PetscErrorCode  MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
59 {
60   PetscErrorCode ierr;
61 
62   PetscFunctionBegin;
63   if (right) {
64     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
65     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
66     ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
67     ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
68     ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
69   }
70   if (left) {
71     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
72     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
73     ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
74     ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
75     ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
76 
77 
78   }
79   PetscFunctionReturn(0);
80 }
81 
82 
83 #undef __FUNCT__
84 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
85 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
86 {
87   /* This multiplication sequence is different sequence
88      than the CPU version. In particular, the diagonal block
89      multiplication kernel is launched in one stream. Then,
90      in a separate stream, the data transfers from DeviceToHost
91      (with MPI messaging in between), then HostToDevice are
92      launched. Once the data transfer stream is synchronized,
93      to ensure messaging is complete, the MatMultAdd kernel
94      is launched in the original (MatMult) stream to protect
95      against race conditions.
96 
97      This sequence should only be called for GPU computation. */
98   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
99   PetscErrorCode ierr;
100   PetscInt       nt;
101 
102   PetscFunctionBegin;
103   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
104   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
105   ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
106   ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
107   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
108   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
109   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
110   ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
111   PetscFunctionReturn(0);
112 }
113 
114 #undef __FUNCT__
115 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
116 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
117 {
118   /* This multiplication sequence is different sequence
119      than the CPU version. In particular, the diagonal block
120      multiplication kernel is launched in one stream. Then,
121      in a separate stream, the data transfers from DeviceToHost
122      (with MPI messaging in between), then HostToDevice are
123      launched. Once the data transfer stream is synchronized,
124      to ensure messaging is complete, the MatMultAdd kernel
125      is launched in the original (MatMult) stream to protect
126      against race conditions.
127 
128      This sequence should only be called for GPU computation. */
129   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
130   PetscErrorCode ierr;
131   PetscInt       nt;
132 
133   PetscFunctionBegin;
134   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
135   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
136   ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
137   ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
138   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
139   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
140   ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
141   ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
142   PetscFunctionReturn(0);
143 }
144 
145 EXTERN_C_BEGIN
146 #undef __FUNCT__
147 #define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
148 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
149 {
150   Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
151   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
152 
153   PetscFunctionBegin;
154   switch (op) {
155   case MAT_CUSPARSE_MULT_DIAG:
156     cusparseStruct->diagGPUMatFormat = format;
157     break;
158   case MAT_CUSPARSE_MULT_OFFDIAG:
159     cusparseStruct->offdiagGPUMatFormat = format;
160     break;
161   case MAT_CUSPARSE_ALL:
162     cusparseStruct->diagGPUMatFormat    = format;
163     cusparseStruct->offdiagGPUMatFormat = format;
164     break;
165   default:
166     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op);
167   }
168   PetscFunctionReturn(0);
169 }
170 EXTERN_C_END
171 
172 
173 #undef __FUNCT__
174 #define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
175 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
176 {
177   MatCUSPARSEStorageFormat format;
178   PetscErrorCode           ierr;
179   PetscBool                flg;
180 
181   PetscFunctionBegin;
182   ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr);
183   ierr = PetscObjectOptionsBegin((PetscObject)A);
184   if (A->factortype==MAT_FACTOR_NONE) {
185     ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
186                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
187     if (flg) {
188       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
189     }
190     ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
191                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
192     if (flg) {
193       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
194     }
195     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
196                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
197     if (flg) {
198       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
199     }
200   }
201   ierr = PetscOptionsEnd();CHKERRQ(ierr);
202   PetscFunctionReturn(0);
203 }
204 
205 #undef __FUNCT__
206 #define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
207 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
208 {
209   PetscErrorCode     ierr;
210   Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
211   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
212 
213   PetscFunctionBegin;
214   try {
215     delete cusparseStruct;
216   } catch(char *ex) {
217     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
218   }
219   cusparseStruct = 0;
220 
221   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
222   PetscFunctionReturn(0);
223 }
224 
225 EXTERN_C_BEGIN
226 #undef __FUNCT__
227 #define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
228 PetscErrorCode  MatCreate_MPIAIJCUSPARSE(Mat A)
229 {
230   PetscErrorCode     ierr;
231   Mat_MPIAIJ         *a;
232   Mat_MPIAIJCUSPARSE * cusparseStruct;
233 
234   PetscFunctionBegin;
235   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
236   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C","MatMPIAIJSetPreallocation_MPIAIJCUSPARSE",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
237   a        = (Mat_MPIAIJ*)A->data;
238   a->spptr = new Mat_MPIAIJCUSPARSE;
239 
240   cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
241   cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
242   cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
243 
244   A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
245   A->ops->mult           = MatMult_MPIAIJCUSPARSE;
246   A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
247   A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
248   A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
249 
250   ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
251   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_MPIAIJCUSPARSE", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
252   PetscFunctionReturn(0);
253 }
254 EXTERN_C_END
255 
256 /*@
257    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
260    assembly performance the user should preallocate the matrix storage by setting
261    the parameter nz (or the array nnz).  By setting these parameters accurately,
262    performance during matrix assembly can be increased by more than a factor of 50.
263    This type is only available when using the 'txpetscgpu' package. Use --download-txpetscgpu
264    to build/install PETSc to use different CUSPARSE base matrix types.
265 
266    Collective on MPI_Comm
267 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal block
           (possibly different for each row) or NULL
275 
276    Output Parameter:
277 .  A - the matrix
278 
279    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
280    MatXXXXSetPreallocation() paradigm instead of this routine directly.
281    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
282 
283    Notes:
   If d_nnz (respectively o_nnz) is given then d_nz (respectively o_nz) is ignored
285 
286    The AIJ format (also called the Yale sparse matrix format or
287    compressed row storage), is fully compatible with standard Fortran 77
288    storage.  That is, the stored row and column indices can begin at
289    either one (as in Fortran) or zero.  See the users' manual for details.
290 
291    Specify the preallocated storage with either nz or nnz (not both).
292    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
293    allocation.  For large problems you MUST preallocate memory or you
294    will get TERRIBLE performance, see the users' manual chapter on matrices.
295 
296    By default, this format uses inodes (identical nodes) when possible, to
297    improve numerical efficiency of matrix-vector products and solves. We
298    search for consecutive rows with the same nonzero structure, thereby
299    reusing matrix information to achieve increased efficiency.
300 
301    Level: intermediate
302 
303 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
304 @*/
305 #undef __FUNCT__
306 #define __FUNCT__ "MatCreateAIJCUSPARSE"
307 PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
308 {
309   PetscErrorCode ierr;
310   PetscMPIInt    size;
311 
312   PetscFunctionBegin;
313   ierr = MatCreate(comm,A);CHKERRQ(ierr);
314   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
315   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
316   if (size > 1) {
317     ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
318     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
319   } else {
320     ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
321     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
322   }
323   PetscFunctionReturn(0);
324 }
325 
326 /*M
327    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
328 
329    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in CSR format.
330    All matrix calculations are performed using the Nvidia CUSPARSE library. Use of the
331    CUSPARSE library REQUIRES the 'txpetscgpu' package. ELL and HYB formats are also available
332    in the txpetscgpu package. Use --download-txpetscgpu to build/install PETSc to use different
333    GPU storage formats with CUSPARSE matrix types.
334 
335    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
336    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
337    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
338    for communicators controlling multiple processes.  It is recommended that you call both of
339    the above preallocation routines for simplicity.
340 
341    Options Database Keys:
342 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
343 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
344 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
345 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
346 
347   Level: beginner
348 
349  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
352