xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision af0996ce37bc06907c37d8d91773840993d61e62)
1 #define PETSC_SKIP_COMPLEX
2 
3 #include <petscconf.h>
4 PETSC_CUDA_EXTERN_C_BEGIN
5 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
6 PETSC_CUDA_EXTERN_C_END
7 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
8 
9 
10 #undef __FUNCT__
11 #define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
12 PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
13 {
14   Mat_MPIAIJ         *b               = (Mat_MPIAIJ*)B->data;
15   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
16   PetscErrorCode     ierr;
17   PetscInt           i;
18 
19   PetscFunctionBegin;
20   if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
21   if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
22   if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
23   if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);
24 
25   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
26   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
27   if (d_nnz) {
28     for (i=0; i<B->rmap->n; i++) {
29       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
30     }
31   }
32   if (o_nnz) {
33     for (i=0; i<B->rmap->n; i++) {
34       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
35     }
36   }
37   if (!B->preallocated) {
38     /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
39     ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
40     ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
41     ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
42     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
43     ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
44     ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
45     ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
46     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
47   }
48   ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
49   ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
50   ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
51   ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
52   ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
53   ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
54   ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
55   ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
56 
57   B->preallocated = PETSC_TRUE;
58   PetscFunctionReturn(0);
59 }
60 
61 #undef __FUNCT__
62 #define __FUNCT__ "MatCreateVecs_MPIAIJCUSPARSE"
63 PetscErrorCode  MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
64 {
65   PetscErrorCode ierr;
66   PetscInt rbs,cbs;
67 
68   PetscFunctionBegin;
69   ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
70   if (right) {
71     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
72     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
73     ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
74     ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
75     ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
76   }
77   if (left) {
78     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
79     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
80     ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
81     ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
82     ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
83 
84 
85   }
86   PetscFunctionReturn(0);
87 }
88 
89 
90 #undef __FUNCT__
91 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
92 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
93 {
94   /* This multiplication sequence is different sequence
95      than the CPU version. In particular, the diagonal block
96      multiplication kernel is launched in one stream. Then,
97      in a separate stream, the data transfers from DeviceToHost
98      (with MPI messaging in between), then HostToDevice are
99      launched. Once the data transfer stream is synchronized,
100      to ensure messaging is complete, the MatMultAdd kernel
101      is launched in the original (MatMult) stream to protect
102      against race conditions.
103 
104      This sequence should only be called for GPU computation. */
105   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
106   PetscErrorCode ierr;
107   PetscInt       nt;
108 
109   PetscFunctionBegin;
110   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
111   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
112   ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
113   ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
114   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
115   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
116   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
117   ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
118   PetscFunctionReturn(0);
119 }
120 
121 #undef __FUNCT__
122 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
123 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
124 {
125   /* This multiplication sequence is different sequence
126      than the CPU version. In particular, the diagonal block
127      multiplication kernel is launched in one stream. Then,
128      in a separate stream, the data transfers from DeviceToHost
129      (with MPI messaging in between), then HostToDevice are
130      launched. Once the data transfer stream is synchronized,
131      to ensure messaging is complete, the MatMultAdd kernel
132      is launched in the original (MatMult) stream to protect
133      against race conditions.
134 
135      This sequence should only be called for GPU computation. */
136   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
137   PetscErrorCode ierr;
138   PetscInt       nt;
139 
140   PetscFunctionBegin;
141   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
142   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
143   ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
144   ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
145   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
146   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
147   ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
148   ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
149   PetscFunctionReturn(0);
150 }
151 
152 #undef __FUNCT__
153 #define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
154 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
155 {
156   Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
157   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
158 
159   PetscFunctionBegin;
160   switch (op) {
161   case MAT_CUSPARSE_MULT_DIAG:
162     cusparseStruct->diagGPUMatFormat = format;
163     break;
164   case MAT_CUSPARSE_MULT_OFFDIAG:
165     cusparseStruct->offdiagGPUMatFormat = format;
166     break;
167   case MAT_CUSPARSE_ALL:
168     cusparseStruct->diagGPUMatFormat    = format;
169     cusparseStruct->offdiagGPUMatFormat = format;
170     break;
171   default:
172     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op);
173   }
174   PetscFunctionReturn(0);
175 }
176 
177 #undef __FUNCT__
178 #define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
179 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptions *PetscOptionsObject,Mat A)
180 {
181   MatCUSPARSEStorageFormat format;
182   PetscErrorCode           ierr;
183   PetscBool                flg;
184   Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
185   Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
186 
187   PetscFunctionBegin;
188   ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
189   ierr = PetscObjectOptionsBegin((PetscObject)A);
190   if (A->factortype==MAT_FACTOR_NONE) {
191     ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
192                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
193     if (flg) {
194       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
195     }
196     ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
197                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
198     if (flg) {
199       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
200     }
201     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
202                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
203     if (flg) {
204       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
205     }
206   }
207   ierr = PetscOptionsEnd();CHKERRQ(ierr);
208   PetscFunctionReturn(0);
209 }
210 
211 #undef __FUNCT__
212 #define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
213 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
214 {
215   PetscErrorCode     ierr;
216   Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
217   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
218   cudaError_t        err;
219   cusparseStatus_t   stat;
220 
221   PetscFunctionBegin;
222   try {
223     ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
224     ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
225     stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSP(stat);
226     err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUSP(err);
227     delete cusparseStruct;
228   } catch(char *ex) {
229     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
230   }
231   cusparseStruct = 0;
232 
233   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
234   PetscFunctionReturn(0);
235 }
236 
237 #undef __FUNCT__
238 #define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
239 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
240 {
241   PetscErrorCode     ierr;
242   Mat_MPIAIJ         *a;
243   Mat_MPIAIJCUSPARSE * cusparseStruct;
244   cudaError_t        err;
245   cusparseStatus_t   stat;
246 
247   PetscFunctionBegin;
248   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
249   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
250   a        = (Mat_MPIAIJ*)A->data;
251   a->spptr = new Mat_MPIAIJCUSPARSE;
252 
253   cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
254   cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
255   cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
256   stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSP(stat);
257   err = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUSP(err);
258 
259   A->ops->getvecs        = MatCreateVecs_MPIAIJCUSPARSE;
260   A->ops->mult           = MatMult_MPIAIJCUSPARSE;
261   A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
262   A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
263   A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
264 
265   ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
266   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",  MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
267   PetscFunctionReturn(0);
268 }
269 
270 /*@
271    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
272    (the default parallel PETSc format).  This matrix will ultimately pushed down
273    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
274    assembly performance the user should preallocate the matrix storage by setting
275    the parameter nz (or the array nnz).  By setting these parameters accurately,
276    performance during matrix assembly can be increased by more than a factor of 50.
277 
278    Collective on MPI_Comm
279 
280    Input Parameters:
281 +  comm - MPI communicator, set to PETSC_COMM_SELF
282 .  m - number of rows
283 .  n - number of columns
284 .  nz - number of nonzeros per row (same for all rows)
285 -  nnz - array containing the number of nonzeros in the various rows
286          (possibly different for each row) or NULL
287 
288    Output Parameter:
289 .  A - the matrix
290 
291    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
292    MatXXXXSetPreallocation() paradigm instead of this routine directly.
293    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
294 
295    Notes:
296    If nnz is given then nz is ignored
297 
298    The AIJ format (also called the Yale sparse matrix format or
299    compressed row storage), is fully compatible with standard Fortran 77
300    storage.  That is, the stored row and column indices can begin at
301    either one (as in Fortran) or zero.  See the users' manual for details.
302 
303    Specify the preallocated storage with either nz or nnz (not both).
304    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
305    allocation.  For large problems you MUST preallocate memory or you
306    will get TERRIBLE performance, see the users' manual chapter on matrices.
307 
308    By default, this format uses inodes (identical nodes) when possible, to
309    improve numerical efficiency of matrix-vector products and solves. We
310    search for consecutive rows with the same nonzero structure, thereby
311    reusing matrix information to achieve increased efficiency.
312 
313    Level: intermediate
314 
315 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
316 @*/
317 #undef __FUNCT__
318 #define __FUNCT__ "MatCreateAIJCUSPARSE"
319 PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
320 {
321   PetscErrorCode ierr;
322   PetscMPIInt    size;
323 
324   PetscFunctionBegin;
325   ierr = MatCreate(comm,A);CHKERRQ(ierr);
326   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
327   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
328   if (size > 1) {
329     ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
330     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
331   } else {
332     ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
333     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
334   }
335   PetscFunctionReturn(0);
336 }
337 
338 /*M
339    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
340 
341    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
342    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
343    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
344 
345    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
346    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
347    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
348    for communicators controlling multiple processes.  It is recommended that you call both of
349    the above preallocation routines for simplicity.
350 
351    Options Database Keys:
352 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
353 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
354 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
355 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
356 
357   Level: beginner
358 
359  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
360 M
361 M*/
362