xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 98c3331e5c81c4bfa5036a9b6bc521ec2d439166)
1 #include <petscconf.h>
2 PETSC_CUDA_EXTERN_C_BEGIN
3 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
4 PETSC_CUDA_EXTERN_C_END
5 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
6 
7 
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
/* Preallocation for MPIAIJCUSPARSE: validates the requested nonzero counts,
   creates the diagonal (->A) and off-diagonal (->B) blocks as
   MATSEQAIJCUSPARSE matrices on first use, preallocates both, and pushes the
   cusparse storage format, library handle and CUDA stream down to them. */
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscErrorCode     ierr;
  PetscInt           row;

  PetscFunctionBegin;
  /* Substitute modest per-row defaults when the caller lets PETSc decide */
  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* Per-row counts, when supplied, must be nonnegative for every local row */
  if (d_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (d_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",row,d_nnz[row]);
    }
  }
  if (o_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (o_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",row,o_nnz[row]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
    ierr = MatSetSizes(mpiaij->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(mpiaij->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpiaij->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
    ierr = MatSetSizes(mpiaij->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(mpiaij->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpiaij->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* Both blocks share the format/handle/stream stored on the MPI matrix */
  ierr = MatCUSPARSESetFormat(mpiaij->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(mpiaij->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpiaij->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpiaij->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpiaij->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpiaij->B,cusp->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
58 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE"
/* Creates GPU (VECCUSP) work vectors that conform to the matrix layouts:
   *right matches the column map (a vector the matrix can be applied to) and
   *left matches the row map (a vector that can hold the product). Either
   output pointer may be NULL when that vector is not wanted. */
PetscErrorCode  MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rowbs,colbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rowbs,&colbs);CHKERRQ(ierr);
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,colbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
    /* share the matrix column layout rather than building a fresh one */
    ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rowbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
    /* share the matrix row layout */
    ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
86 
87 
#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* Computes yy = A*xx for the distributed matrix on the GPU.

     This multiplication sequence is a different sequence than the CPU
     version. In particular, the diagonal block multiplication kernel is
     launched in one stream. Then, in a separate stream, the data transfers
     from DeviceToHost (with MPI messaging in between), then HostToDevice are
     launched. Once the data transfer stream is synchronized, to ensure
     messaging is complete, the MatMultAdd kernel is launched in the original
     (MatMult) stream to protect against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;   /* local length of xx; must match the column layout */

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  /* yy = A_diag * xx (local diagonal block), overlapped with the scatter below */
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  /* gather the off-process entries of xx into the work vector a->lvec */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* yy += A_offdiag * lvec */
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
118 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
/* Computes yy = A^T * xx on the GPU.

   The input xx conforms to the ROW layout of A (local length A->rmap->n) and
   the result yy to the COLUMN layout. The off-diagonal block's transpose
   product B^T*xx produces contributions destined for other processes; they
   are placed in the work vector a->lvec and accumulated into yy with a
   reverse ADD_VALUES scatter (the transpose of the forward scatter used by
   MatMult), mirroring the CPU MatMultTranspose_MPIAIJ sequence.

   This sequence should only be called for GPU computation. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;   /* local length of xx; must match the row layout */

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  /* lvec = B^T * xx: contributions owned by other processes */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* yy = A_diag^T * xx: purely local contribution */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* accumulate the off-process contributions into yy */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
149 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
/* Records the requested cusparse storage format for the diagonal block,
   the off-diagonal block, or both; the format is applied to the sequential
   blocks when preallocation runs. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* message must name the operations the cases above actually accept */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
174 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
/* Processes the -mat_cusparse_* options that select the GPU storage format
   (CSR/ELL/HYB) of the diagonal and off-diagonal blocks. Formats only apply
   to unfactored matrices, hence the MAT_FACTOR_NONE guard. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;

  PetscFunctionBegin;
  /* PetscOptionsHead/Tail must appear INSIDE the Begin/End pair */
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
206 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
/* Tears down the CUSPARSE-specific state (library handle, CUDA stream and
   the Mat_MPIAIJCUSPARSE struct itself), then delegates to the base MPIAIJ
   destructor for everything else. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    /* detach the shared cusparse handle from both blocks before destroying it */
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSP(stat);
    err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUSP(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  cusparseStruct = 0;

  /* base-class destroy releases a->A, a->B, scatters and A->data */
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
232 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
/* Type constructor for MATMPIAIJCUSPARSE: builds on MATMPIAIJ, attaches a
   Mat_MPIAIJCUSPARSE struct carrying the cusparse handle, CUDA stream and
   per-block storage formats, and installs the GPU matrix operations. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij;
  Mat_MPIAIJCUSPARSE *cusp;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  /* start from the plain MPIAIJ implementation and specialize it */
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);

  cusp                      = new Mat_MPIAIJCUSPARSE;
  cusp->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusp->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusp->handle));CHKERRCUSP(stat);
  err = cudaStreamCreate(&(cusp->stream));CHKERRCUSP(err);
  mpiaij        = (Mat_MPIAIJ*)A->data;
  mpiaij->spptr = cusp;

  /* override the operations that must run on the GPU */
  A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
265 
266 /*@
267    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
269    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
270    assembly performance the user should preallocate the matrix storage by setting
271    the parameter nz (or the array nnz).  By setting these parameters accurately,
272    performance during matrix assembly can be increased by more than a factor of 50.
273 
274    Collective on MPI_Comm
275 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the
           off-diagonal block (possibly different for each row) or NULL
283 
284    Output Parameter:
285 .  A - the matrix
286 
287    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
288    MatXXXXSetPreallocation() paradigm instead of this routine directly.
289    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
290 
291    Notes:
292    If nnz is given then nz is ignored
293 
294    The AIJ format (also called the Yale sparse matrix format or
295    compressed row storage), is fully compatible with standard Fortran 77
296    storage.  That is, the stored row and column indices can begin at
297    either one (as in Fortran) or zero.  See the users' manual for details.
298 
299    Specify the preallocated storage with either nz or nnz (not both).
300    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
301    allocation.  For large problems you MUST preallocate memory or you
302    will get TERRIBLE performance, see the users' manual chapter on matrices.
303 
304    By default, this format uses inodes (identical nodes) when possible, to
305    improve numerical efficiency of matrix-vector products and solves. We
306    search for consecutive rows with the same nonzero structure, thereby
307    reusing matrix information to achieve increased efficiency.
308 
309    Level: intermediate
310 
311 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
312 @*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    nproc;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&nproc);CHKERRQ(ierr);
  /* a single-process communicator gets the sequential flavor directly */
  if (nproc > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
333 
334 /*M
335    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
336 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
338    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
339    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
340 
341    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
342    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
343    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
344    for communicators controlling multiple processes.  It is recommended that you call both of
345    the above preallocation routines for simplicity.
346 
347    Options Database Keys:
348 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
349 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
350 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
351 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
352 
353   Level: beginner
354 
355  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
358