xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision dcca6d9d80ebd869fe6029bd05a3aa9faafef49e)
1 #include "petscconf.h"
2 PETSC_CUDA_EXTERN_C_BEGIN
3 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
4 PETSC_CUDA_EXTERN_C_END
5 #include "mpicusparsematimpl.h"
6 
7 
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
/*
   Preallocation for MPIAIJCUSPARSE: validates the requested nonzero counts,
   creates the diagonal (b->A) and off-diagonal (b->B) blocks as
   MATSEQAIJCUSPARSE matrices on first use, preallocates both, and pushes the
   GPU storage formats, cusparse handle, and CUDA stream held in the
   Mat_MPIAIJCUSPARSE struct down into the two blocks.

   Input Parameters:
+  B     - the MPIAIJCUSPARSE matrix
.  d_nz  - nonzeros per row of the diagonal block (same for all local rows)
.  d_nnz - per-row nonzero counts for the diagonal block, or NULL
.  o_nz  - nonzeros per row of the off-diagonal block (same for all local rows)
-  o_nnz - per-row nonzero counts for the off-diagonal block, or NULL
*/
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b               = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  /* PETSC_DEFAULT/PETSC_DECIDE fall back to fixed per-row estimates */
  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  /* layouts must be finalized before B->rmap->n / B->cmap->n are meaningful below */
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* both nnz arrays are indexed by local row, hence both loops run over B->rmap->n */
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    /* diagonal block: local-rows x local-columns */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    /* off-diagonal block: local-rows x global-columns (compacted later) */
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* share one storage-format choice, cusparse handle, and stream across both blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
58 
59 #undef __FUNCT__
60 #define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE"
61 PetscErrorCode  MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
62 {
63   PetscErrorCode ierr;
64 
65   PetscFunctionBegin;
66   if (right) {
67     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
68     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
69     ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
70     ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
71     ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
72   }
73   if (left) {
74     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
75     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
76     ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
77     ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
78     ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
79 
80 
81   }
82   PetscFunctionReturn(0);
83 }
84 
85 
#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence is different sequence
     than the CPU version. In particular, the diagonal block
     multiplication kernel is launched in one stream. Then,
     in a separate stream, the data transfers from DeviceToHost
     (with MPI messaging in between), then HostToDevice are
     launched. Once the data transfer stream is synchronized,
     to ensure messaging is complete, the MatMultAdd kernel
     is launched in the original (MatMult) stream to protect
     against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  /* a forward multiply takes xx of local size cmap->n (the column/domain layout) */
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  /* yy = A_diag * xx, launched before the scatter so kernel and messaging overlap */
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  /* gather the ghost values of xx needed by the off-diagonal block into lvec */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* yy += A_offdiag * lvec */
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
116 
117 #undef __FUNCT__
118 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
119 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
120 {
121   /* This multiplication sequence is different sequence
122      than the CPU version. In particular, the diagonal block
123      multiplication kernel is launched in one stream. Then,
124      in a separate stream, the data transfers from DeviceToHost
125      (with MPI messaging in between), then HostToDevice are
126      launched. Once the data transfer stream is synchronized,
127      to ensure messaging is complete, the MatMultAdd kernel
128      is launched in the original (MatMult) stream to protect
129      against race conditions.
130 
131      This sequence should only be called for GPU computation. */
132   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
133   PetscErrorCode ierr;
134   PetscInt       nt;
135 
136   PetscFunctionBegin;
137   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
138   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
139   ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
140   ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
141   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
142   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
143   ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
144   ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
145   PetscFunctionReturn(0);
146 }
147 
148 #undef __FUNCT__
149 #define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
150 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
151 {
152   Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
153   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
154 
155   PetscFunctionBegin;
156   switch (op) {
157   case MAT_CUSPARSE_MULT_DIAG:
158     cusparseStruct->diagGPUMatFormat = format;
159     break;
160   case MAT_CUSPARSE_MULT_OFFDIAG:
161     cusparseStruct->offdiagGPUMatFormat = format;
162     break;
163   case MAT_CUSPARSE_ALL:
164     cusparseStruct->diagGPUMatFormat    = format;
165     cusparseStruct->offdiagGPUMatFormat = format;
166     break;
167   default:
168     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op);
169   }
170   PetscFunctionReturn(0);
171 }
172 
173 #undef __FUNCT__
174 #define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
175 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
176 {
177   MatCUSPARSEStorageFormat format;
178   PetscErrorCode           ierr;
179   PetscBool                flg;
180 
181   PetscFunctionBegin;
182   ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr);
183   ierr = PetscObjectOptionsBegin((PetscObject)A);
184   if (A->factortype==MAT_FACTOR_NONE) {
185     ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
186                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
187     if (flg) {
188       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
189     }
190     ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
191                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
192     if (flg) {
193       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
194     }
195     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
196                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
197     if (flg) {
198       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
199     }
200   }
201   ierr = PetscOptionsEnd();CHKERRQ(ierr);
202   PetscFunctionReturn(0);
203 }
204 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
/*
   Destroys an MPIAIJCUSPARSE matrix: detaches the shared cusparse handle from
   the two sequential blocks, destroys the handle and CUDA stream owned by the
   Mat_MPIAIJCUSPARSE struct, frees the struct, then delegates the remaining
   teardown to MatDestroy_MPIAIJ().
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    /* the blocks must drop their references to the shared handle before the
       handle itself is destroyed below;
       NOTE(review): assumes a->A and a->B were created (preallocation ran) -- confirm */
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSP(stat);
    /* NOTE(review): CHKERRCUSP applied to a cudaError_t; presumably relies on
       cudaSuccess == 0 == CUSPARSE_STATUS_SUCCESS -- confirm */
    err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUSP(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* nulls only the local copy; a->spptr itself is left for MatDestroy_MPIAIJ */
  cusparseStruct = 0;

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
230 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
/*
   Type constructor for MATMPIAIJCUSPARSE: builds on MatCreate_MPIAIJ, then
   allocates the Mat_MPIAIJCUSPARSE struct (default CSR formats, a fresh
   cusparse handle and CUDA stream) and installs the GPU-specific operations
   and composed functions.
*/
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE * cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  /* base MPIAIJ construction must run first: it allocates A->data (Mat_MPIAIJ) */
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  /* override preallocation so the blocks come out as MATSEQAIJCUSPARSE */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  /* handle and stream are owned here and released in MatDestroy_MPIAIJCUSPARSE;
     NOTE(review): CHKERRCUSP applied to a cudaError_t on the stream call -- confirm */
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSP(stat);
  err = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUSP(err);

  A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",  MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
263 
264 /*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.
271 
272    Collective on MPI_Comm
273 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the
           off-diagonal block (possibly different for each row) or NULL
281 
282    Output Parameter:
283 .  A - the matrix
284 
285    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
286    MatXXXXSetPreallocation() paradigm instead of this routine directly.
287    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
288 
289    Notes:
290    If nnz is given then nz is ignored
291 
292    The AIJ format (also called the Yale sparse matrix format or
293    compressed row storage), is fully compatible with standard Fortran 77
294    storage.  That is, the stored row and column indices can begin at
295    either one (as in Fortran) or zero.  See the users' manual for details.
296 
297    Specify the preallocated storage with either nz or nnz (not both).
298    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
299    allocation.  For large problems you MUST preallocate memory or you
300    will get TERRIBLE performance, see the users' manual chapter on matrices.
301 
302    By default, this format uses inodes (identical nodes) when possible, to
303    improve numerical efficiency of matrix-vector products and solves. We
304    search for consecutive rows with the same nonzero structure, thereby
305    reusing matrix information to achieve increased efficiency.
306 
307    Level: intermediate
308 
309 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
310 @*/
311 #undef __FUNCT__
312 #define __FUNCT__ "MatCreateAIJCUSPARSE"
313 PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
314 {
315   PetscErrorCode ierr;
316   PetscMPIInt    size;
317 
318   PetscFunctionBegin;
319   ierr = MatCreate(comm,A);CHKERRQ(ierr);
320   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
321   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
322   if (size > 1) {
323     ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
324     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
325   } else {
326     ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
327     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
328   }
329   PetscFunctionReturn(0);
330 }
331 
332 /*M
333    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
334 
335    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
336    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
337    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
338 
339    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
340    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
341    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
342    for communicators controlling multiple processes.  It is recommended that you call both of
343    the above preallocation routines for simplicity.
344 
345    Options Database Keys:
346 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
347 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
348 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
349 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
350 
351   Level: beginner
352 
353  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
356