xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 0005d66cf2be2b0fca19463d69605dfad837b879)
1 #include "petscconf.h"
2 PETSC_CUDA_EXTERN_C_BEGIN
3 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
4 PETSC_CUDA_EXTERN_C_END
5 #include "mpicusparsematimpl.h"
6 
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
/* Preallocates the diagonal (A) and off-diagonal (B) sequential blocks of an
   MPIAIJCUSPARSE matrix and propagates the requested GPU storage format to each. */
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscErrorCode     ierr;
  PetscInt           row;

  PetscFunctionBegin;
  /* Substitute modest default estimates when the caller did not provide counts */
  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  ierr = PetscLayoutSetBlockSize(B->rmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize(B->cmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* Validate any user-supplied per-row nonzero counts before using them */
  if (d_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (d_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",row,d_nnz[row]);
    }
  }
  if (o_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (o_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",row,o_nnz[row]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create the two sequential blocks as MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
    ierr = MatSetSizes(mpiaij->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(mpiaij->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,mpiaij->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
    ierr = MatSetSizes(mpiaij->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(mpiaij->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,mpiaij->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* Push the chosen GPU storage format down to each block */
  ierr = MatCUSPARSESetFormat(mpiaij->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(mpiaij->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
55 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE"
/*
   MatGetVecs_MPIAIJCUSPARSE - Creates CUSP vectors whose layouts conform to the matrix.

   Input Parameter:
.  mat - the matrix

   Output Parameters:
+  right - (optional) vector the matrix can be multiplied against (column layout)
-  left  - (optional) vector the matrix-vector product can be stored in (row layout)
*/
PetscErrorCode  MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* the right vector lives in the column space, so it takes the COLUMN block size
       (the original code used mat->rmap->bs, which is wrong for non-square block sizes) */
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
    ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
    ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
81 
82 
#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* GPU-aware yy = A*xx.  The sequence differs from the CPU version: the
     diagonal-block multiply kernel runs in one stream while, in a separate
     stream, the DeviceToHost transfers, MPI messaging, and HostToDevice
     transfers for the off-process entries proceed.  After the transfer
     stream is synchronized (so messaging is known complete), the MatMultAdd
     kernel for the off-diagonal block is launched back in the original
     (MatMult) stream, which guards against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nx;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nx);CHKERRQ(ierr);
  if (nx != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nx);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  /* diagonal block, overlapped with the scatter of off-process entries into lvec */
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* accumulate the off-diagonal contribution */
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
113 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* Computes yy = A^T * xx on the GPU.

     Fixes relative to the previous version:
       - __FUNCT__ said "MatMult_MPIAIJCUSPARSE" (wrong name in error traces);
       - the input xx to a transpose multiply conforms to the ROW layout of A,
         so its local size must be compared against A->rmap->n, not A->cmap->n;
       - the off-process contribution must be computed as B^T*xx into lvec and
         then scatter-ADDed in REVERSE into yy (as in MatMultTranspose_MPIAIJ);
         the old code forward-scattered xx and called multtransposeadd, which
         does not implement the transpose of the parallel layout.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  /* off-diagonal block transpose into the local work vector */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* diagonal block transpose directly into yy */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* add the off-process contributions back into yy */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
144 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
/* Records the requested GPU storage format for the diagonal and/or off-diagonal
   block of an MPIAIJCUSPARSE matrix; the format is applied at preallocation time. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed message: it previously listed MAT_CUSPARSE_MULT_DIAG twice and a
       nonexistent MAT_CUSPARSE_MULT_ALL instead of the actual enum names */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
169 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
/* Processes the -mat_cusparse_* storage-format options for an MPIAIJCUSPARSE matrix. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;

  PetscFunctionBegin;
  /* PetscOptionsHead() must come AFTER PetscObjectOptionsBegin(): the Begin
     macro establishes the option-processing context the Head call relies on.
     The previous code called them in the opposite order and dropped the
     error code of the Begin call. */
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
  ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
201 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
/* Frees the CUSPARSE-specific context and then defers to the base MPIAIJ destructor. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;

  PetscFunctionBegin;
  /* the context is allocated with C++ new, so release it with delete and
     translate any thrown error into a PETSc error */
  try {
    delete cusp;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  cusp = 0;

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
221 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
/* Constructor for the MPIAIJCUSPARSE type: builds on MPIAIJ, attaches the
   CUSPARSE context with CSR defaults, and installs the GPU operation table. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);

  /* attach the CUSPARSE-specific context; both blocks default to CSR storage */
  a        = (Mat_MPIAIJ*)A->data;
  cusp     = new Mat_MPIAIJCUSPARSE;
  a->spptr = cusp;

  cusp->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusp->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;

  /* override the operations that must run on the GPU */
  A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",  MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
250 
251 /*@
252    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
254    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
255    assembly performance the user should preallocate the matrix storage by setting
256    the parameter nz (or the array nnz).  By setting these parameters accurately,
257    performance during matrix assembly can be increased by more than a factor of 50.
258    This type is only available when using the 'txpetscgpu' package. Use --download-txpetscgpu
259    to build/install PETSc to use different CUSPARSE base matrix types.
260 
261    Collective on MPI_Comm
262 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
         (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
         (possibly different for each row) or NULL
270 
271    Output Parameter:
272 .  A - the matrix
273 
274    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
275    MatXXXXSetPreallocation() paradigm instead of this routine directly.
276    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
277 
278    Notes:
   If d_nnz (resp. o_nnz) is given then d_nz (resp. o_nz) is ignored
280 
281    The AIJ format (also called the Yale sparse matrix format or
282    compressed row storage), is fully compatible with standard Fortran 77
283    storage.  That is, the stored row and column indices can begin at
284    either one (as in Fortran) or zero.  See the users' manual for details.
285 
286    Specify the preallocated storage with either nz or nnz (not both).
287    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
288    allocation.  For large problems you MUST preallocate memory or you
289    will get TERRIBLE performance, see the users' manual chapter on matrices.
290 
291    By default, this format uses inodes (identical nodes) when possible, to
292    improve numerical efficiency of matrix-vector products and solves. We
293    search for consecutive rows with the same nonzero structure, thereby
294    reusing matrix information to achieve increased efficiency.
295 
296    Level: intermediate
297 
298 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
299 @*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size == 1) {
    /* uniprocessor communicator: a sequential CUSPARSE matrix suffices */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    /* parallel communicator: use the MPI variant with diagonal/off-diagonal preallocation */
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
320 
321 /*M
322    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
323 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in CSR format.
325    All matrix calculations are performed using the Nvidia CUSPARSE library. Use of the
326    CUSPARSE library REQUIRES the 'txpetscgpu' package. ELL and HYB formats are also available
327    in the txpetscgpu package. Use --download-txpetscgpu to build/install PETSc to use different
328    GPU storage formats with CUSPARSE matrix types.
329 
330    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
331    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
332    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
333    for communicators controlling multiple processes.  It is recommended that you call both of
334    the above preallocation routines for simplicity.
335 
336    Options Database Keys:
337 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
338 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
339 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
340 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
341 
342   Level: beginner
343 
344  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
347