xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision f416af30f06a2ff39a170f7c2ced34af851874d9)
1 #define PETSC_SKIP_COMPLEX
2 #define PETSC_SKIP_SPINLOCK
3 
4 #include <petscconf.h>
5 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
6 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
7 
8 
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
/*
   MatMPIAIJSetPreallocation_MPIAIJCUSPARSE - preallocation routine for the MPIAIJCUSPARSE
   matrix type.  Validates the requested nonzero counts, creates the local diagonal (b->A)
   and off-diagonal (b->B) blocks as MATSEQAIJCUSPARSE matrices on first use, preallocates
   both, and pushes the storage format, cusparse handle, and CUDA stream held in the
   Mat_MPIAIJCUSPARSE struct down into the two blocks.

   Input Parameters:
+  B     - the MPIAIJCUSPARSE matrix being preallocated
.  d_nz  - nonzeros per row of the diagonal block (PETSC_DEFAULT/PETSC_DECIDE selects 5)
.  d_nnz - per-row nonzero counts for the diagonal block, or NULL
.  o_nz  - nonzeros per row of the off-diagonal block (PETSC_DEFAULT/PETSC_DECIDE selects 2)
-  o_nnz - per-row nonzero counts for the off-diagonal block, or NULL
*/
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b               = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  /* Same default row lengths as the plain MPIAIJ preallocation routine */
  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* Both nnz arrays are indexed by local row, hence both loops run over B->rmap->n */
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    /* The off-diagonal block is stored with the full global column width (B->cmap->N) */
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* Propagate the GPU configuration chosen for the parallel matrix into both blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
59 
#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_MPIAIJCUSPARSE"
/*
   MatCreateVecs_MPIAIJCUSPARSE - creates CUDA vectors conformal with this matrix:
   *right matches the column layout (a vector the matrix may be applied to) and
   *left matches the row layout (a vector that can hold the result).  Either output
   pointer may be NULL when that vector is not wanted.
*/
PetscErrorCode  MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rowbs,colbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rowbs,&colbs);CHKERRQ(ierr);
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,colbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUDA);CHKERRQ(ierr);
    ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rowbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUDA);CHKERRQ(ierr);
    ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
87 
88 
#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the CPU version.  In particular, the
     diagonal-block multiplication kernel is launched in one stream.  Then, in a
     separate stream, the DeviceToHost transfers (with MPI messaging in between) and
     the HostToDevice transfers are launched.  Once the data-transfer stream is
     synchronized, to ensure messaging is complete, the MatMultAdd kernel is launched
     in the original (MatMult) stream to protect against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  /* y = A*x requires x conformal with the column layout */
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  /* diagonal-block SpMV; overlaps with the communication started below */
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  /* gather the ghost entries of x needed by the off-diagonal block into a->lvec */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* off-diagonal contribution: y += B*lvec */
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
119 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
/*
   MatMultTranspose_MPIAIJCUSPARSE - computes yy = A^T * xx on the GPU.

   Bug fixes relative to the previous revision, which had copied the forward MatMult
   communication pattern verbatim:
   1. For a transpose multiply the input vector xx must conform to the ROW layout of A,
      so the size check is against A->rmap->n (it previously tested A->cmap->n).
   2. The correct transpose sequence (mirroring MatMultTranspose_MPIAIJ) is:
         lvec = B^T * xx      (local contribution destined for other ranks)
         yy   = A^T * xx      (diagonal-block contribution)
         yy  += scatter-reverse-add of lvec
      The previous code instead forward-scattered xx into lvec and called
      multtransposeadd on it, which is dimensionally wrong.

   This sequence should only be called for GPU computation.
*/
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  /* off-diagonal contribution headed to other processes: lvec = B^T * xx */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* diagonal-block contribution: yy = A^T * xx */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* accumulate the remote contributions into yy */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
150 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
/*
   MatCUSPARSESetFormat_MPIAIJCUSPARSE - records the requested GPU storage format in the
   Mat_MPIAIJCUSPARSE struct for the diagonal block, the off-diagonal block, or both.
   The stored format is applied to the blocks during preallocation.

   Bug fix: the unsupported-operation error message previously listed
   MAT_CUSPARSE_MULT_DIAG twice and a nonexistent MAT_CUSPARSE_MULT_ALL; it now names
   the three operations actually handled above.
*/
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
175 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
/*
   MatSetFromOptions_MPIAIJCUSPARSE - processes the -mat_cusparse_* options database keys
   that select the GPU storage format (csr/ell/hyb) used for SpMV with the diagonal block,
   the off-diagonal block, or both.  Formats are only configurable for unfactored matrices.

   Bug fix: the return value of PetscObjectOptionsBegin() was assigned to ierr but never
   checked; CHKERRQ added.

   NOTE(review): PetscOptionsHead() is called here without a matching PetscOptionsTail(),
   and a nested options block is opened via PetscObjectOptionsBegin() — confirm this
   pairing against the callers' PetscOptionsBegin/End structure.
*/
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
209 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
/*
   MatDestroy_MPIAIJCUSPARSE - destructor for the MPIAIJCUSPARSE subclass: releases the
   cusparse handles held by the two sequential blocks, destroys the shared cusparse
   handle and CUDA stream, frees the Mat_MPIAIJCUSPARSE struct, then delegates to the
   base MPIAIJ destructor.

   Bug fix: the old code cleared only the local pointer copy (`cusparseStruct = 0`),
   leaving a->spptr dangling after `delete` while MatDestroy_MPIAIJ tears down the rest
   of the matrix; the stored pointer is now cleared instead.
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
    a->spptr = 0;  /* clear the stored pointer, not just the local copy */
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
235 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
/*
   MatCreate_MPIAIJCUSPARSE - type constructor: builds on MatCreate_MPIAIJ, allocates the
   Mat_MPIAIJCUSPARSE struct (default storage format CSR for both blocks, plus a fresh
   cusparse handle and CUDA stream), installs the CUSPARSE-specific operations, and
   composes the preallocation and format-setting methods.
*/
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusp;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);

  a    = (Mat_MPIAIJ*)A->data;
  cusp = new Mat_MPIAIJCUSPARSE;
  a->spptr = cusp;

  cusp->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusp->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusp->handle));CHKERRCUDA(stat);
  err  = cudaStreamCreate(&(cusp->stream));CHKERRCUDA(err);

  A->ops->getvecs        = MatCreateVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",  MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
268 
269 /*@
270    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
272    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
273    assembly performance the user should preallocate the matrix storage by setting
274    the parameter nz (or the array nnz).  By setting these parameters accurately,
275    performance during matrix assembly can be increased by more than a factor of 50.
276 
277    Collective on MPI_Comm
278 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row of the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row of the off-diagonal block
           (possibly different for each row) or NULL
286 
287    Output Parameter:
288 .  A - the matrix
289 
290    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
291    MatXXXXSetPreallocation() paradigm instead of this routine directly.
292    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
293 
294    Notes:
   If d_nnz (respectively o_nnz) is given then d_nz (respectively o_nz) is ignored
296 
297    The AIJ format (also called the Yale sparse matrix format or
298    compressed row storage), is fully compatible with standard Fortran 77
299    storage.  That is, the stored row and column indices can begin at
300    either one (as in Fortran) or zero.  See the users' manual for details.
301 
302    Specify the preallocated storage with either nz or nnz (not both).
303    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
304    allocation.  For large problems you MUST preallocate memory or you
305    will get TERRIBLE performance, see the users' manual chapter on matrices.
306 
307    By default, this format uses inodes (identical nodes) when possible, to
308    improve numerical efficiency of matrix-vector products and solves. We
309    search for consecutive rows with the same nonzero structure, thereby
310    reusing matrix information to achieve increased efficiency.
311 
312    Level: intermediate
313 
314 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
315 @*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
/* Convenience constructor: creates either a MATSEQAIJCUSPARSE or a MATMPIAIJCUSPARSE
   matrix depending on the size of the communicator, and forwards the preallocation
   arguments to the matching routine.  See the manual page above for details. */
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    nproc;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&nproc);CHKERRQ(ierr);
  if (nproc == 1) {
    /* uniprocessor communicator: a sequential CUSPARSE matrix suffices */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
336 
337 /*M
338    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
339 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
341    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
342    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
343 
344    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
345    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
346    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
347    for communicators controlling multiple processes.  It is recommended that you call both of
348    the above preallocation routines for simplicity.
349 
350    Options Database Keys:
351 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
352 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
353 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
354 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
355 
356   Level: beginner
357 
358  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
361