xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 70baa948e42e6bb1660071283af10f413c9269b6)
1 #define PETSC_SKIP_SPINLOCK
2 
3 #include <petscconf.h>
4 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
5 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
6 
7 
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
/*
   Preallocation for the MPIAIJCUSPARSE matrix type: validates the per-row
   nonzero hints, creates the sequential diagonal (b->A) and off-diagonal
   (b->B) blocks as MATSEQAIJCUSPARSE on first use, preallocates both, and
   pushes the GPU storage formats, cuSPARSE handle, and CUDA stream held in
   the MPI-level Mat_MPIAIJCUSPARSE struct down to the two sequential blocks.

   Input: B     - the MPIAIJCUSPARSE matrix being preallocated
          d_nz  - nonzeros per row of the diagonal block (same for all rows)
          d_nnz - per-row nonzero counts for the diagonal block, or NULL
          o_nz  - nonzeros per row of the off-diagonal block (same for all rows)
          o_nnz - per-row nonzero counts for the off-diagonal block, or NULL
*/
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b               = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* Reject negative per-row counts up front, before any storage is created */
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    /* diagonal block: local rows x local columns */
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    /* off-diagonal block: local rows x GLOBAL columns (compressed later) */
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* Both sequential blocks share the MPI matrix's cuSPARSE handle and CUDA
     stream; each may use a different GPU storage format for SpMV */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
53 
#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_MPIAIJCUSPARSE"
/*
   Creates CUDA-resident work vectors compatible with this matrix: the right
   vector matches the column layout (input of MatMult), the left vector
   matches the row layout (output of MatMult). Either output pointer may be
   NULL if that vector is not wanted.
*/
PetscErrorCode  MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rowbs,colbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rowbs,&colbs);CHKERRQ(ierr);
  if (right) {
    /* sized and laid out like the column map, stored as VECCUDA */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,colbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUDA);CHKERRQ(ierr);
    ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
  }
  if (left) {
    /* sized and laid out like the row map, stored as VECCUDA */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rowbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUDA);CHKERRQ(ierr);
    ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
81 
#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
/*
   yy = A*xx for the MPI AIJ CUSPARSE format.

   Unlike the CPU path, the diagonal-block SpMV kernel is launched in one
   stream while, in a separate stream, the ghost values travel
   DeviceToHost -> MPI -> HostToDevice. After the transfer stream has been
   synchronized (guaranteeing the messaging finished), the off-diagonal
   MatMultAdd kernel runs in the original stream, which prevents a race
   between the two products. Intended for GPU execution only.
*/
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  /* diagonal-block product first, overlapping with the ghost exchange below */
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* add the off-diagonal contribution once the ghost values are on the GPU */
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
112 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
/*
   yy = A^T * xx for the MPI AIJ CUSPARSE format.

   The transpose product uses the REVERSE communication pattern of MatMult:
   the off-diagonal block's transpose product B^T*xx is computed into the
   local ghost vector lvec, the diagonal block's transpose product A^T*xx is
   computed into yy (overlapping with the communication), and then lvec is
   reverse-scattered and ADDED into yy on the owning processes.

   Fixes relative to the previous version:
   - xx is a left-side vector for A, so its local length must be checked
     against A->rmap->n, not A->cmap->n;
   - the previous code used the FORWARD MatMult pattern (scatter xx into
     lvec, then multtransposeadd on lvec), which is dimensionally wrong for
     the transpose; the correct sequence mirrors MatMultTranspose_MPIAIJ.

   This sequence should only be called for GPU computation.
*/
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  /* off-diagonal contribution, computed locally into the ghost vector */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* diagonal-block transpose product, overlapping with the scatter setup */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* reverse-scatter ADDS the ghost contributions into the owners' yy */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
143 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
/*
   Records the requested cuSPARSE storage format (CSR/ELL/HYB) for the
   diagonal block, the off-diagonal block, or both; the formats are applied
   to the sequential blocks at preallocation time.

   Fix: the unsupported-operation error message previously listed
   MAT_CUSPARSE_MULT_DIAG twice and named a nonexistent MAT_CUSPARSE_MULT_ALL;
   it now names the three operations this routine actually handles.
*/
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
168 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
/*
   Processes the -mat_cusparse_* storage-format options for unfactored
   MPIAIJCUSPARSE matrices. The caller (MatSetFromOptions) has already opened
   the options block and passes it in via PetscOptionsObject.

   Fix: this routine runs inside the caller's options block, so
   PetscOptionsHead() must be paired with PetscOptionsTail(); the previous
   version additionally opened a nested PetscObjectOptionsBegin() block with
   an unchecked error code and closed the Head with PetscOptionsEnd(),
   mismatching both pairs.
*/
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
202 
203 PETSC_EXTERN PetscErrorCode MatAssemblyEnd_MPIAIJ(Mat,MatAssemblyType);
204 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_MPIAIJCUSPARSE"
/*
   Runs the standard MPIAIJ assembly, then — on the first final assembly,
   once the ghost vector lvec exists — switches lvec to the CUDA vector type
   so ghost exchanges stay on the GPU.
*/
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FINAL_ASSEMBLY && !A->was_assembled) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
220 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
/*
   Destroys the CUSPARSE-specific state (per-block handles, the shared
   cuSPARSE handle and CUDA stream, and the Mat_MPIAIJCUSPARSE struct), then
   delegates to the standard MPIAIJ destroy.

   Fix: the previous version zeroed only the local pointer variable after
   delete (a dead store) and left a->spptr dangling at freed memory; the
   member itself is now cleared.
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  a->spptr = 0;   /* do not leave a dangling pointer to the freed struct */

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
246 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
/*
   Constructor for the MPIAIJCUSPARSE matrix type: builds a standard MPIAIJ
   matrix, attaches a Mat_MPIAIJCUSPARSE struct (default CSR formats, a fresh
   cuSPARSE handle and CUDA stream) to spptr, overrides the GPU-capable
   operations, and composes the type-specific preallocation and format
   functions.
*/
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusp;
  cusparseStatus_t   stat;
  cudaError_t        err;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);

  /* Attach the CUSPARSE bookkeeping: CSR by default for both blocks, one
     shared cuSPARSE handle and CUDA stream for the whole MPI matrix. */
  a        = (Mat_MPIAIJ*)A->data;
  cusp     = new Mat_MPIAIJCUSPARSE;
  a->spptr = cusp;
  cusp->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusp->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&cusp->handle);CHKERRCUDA(stat);
  err  = cudaStreamCreate(&cusp->stream);CHKERRCUDA(err);

  /* GPU-aware overrides of the MPIAIJ operations */
  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->getvecs        = MatCreateVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
280 
281 /*@
282    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
283    (the default parallel PETSc format).  This matrix will ultimately pushed down
284    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
285    assembly performance the user should preallocate the matrix storage by setting
286    the parameter nz (or the array nnz).  By setting these parameters accurately,
287    performance during matrix assembly can be increased by more than a factor of 50.
288 
289    Collective on MPI_Comm
290 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the
           off-diagonal block (possibly different for each row) or NULL
298 
299    Output Parameter:
300 .  A - the matrix
301 
302    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
303    MatXXXXSetPreallocation() paradigm instead of this routine directly.
304    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
305 
306    Notes:
307    If nnz is given then nz is ignored
308 
309    The AIJ format (also called the Yale sparse matrix format or
310    compressed row storage), is fully compatible with standard Fortran 77
311    storage.  That is, the stored row and column indices can begin at
312    either one (as in Fortran) or zero.  See the users' manual for details.
313 
314    Specify the preallocated storage with either nz or nnz (not both).
315    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
316    allocation.  For large problems you MUST preallocate memory or you
317    will get TERRIBLE performance, see the users' manual chapter on matrices.
318 
319    By default, this format uses inodes (identical nodes) when possible, to
320    improve numerical efficiency of matrix-vector products and solves. We
321    search for consecutive rows with the same nonzero structure, thereby
322    reusing matrix information to achieve increased efficiency.
323 
324    Level: intermediate
325 
326 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
327 @*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
/*
   Convenience constructor: creates a CUSPARSE AIJ matrix, choosing the
   sequential or MPI variant from the communicator size, and forwards the
   preallocation hints to the matching preallocation routine.
*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    nproc;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&nproc);CHKERRQ(ierr);
  if (nproc == 1) {
    /* single-process communicator: sequential variant, o_nz/o_nnz unused */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
348 
349 /*M
350    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
351 
352    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
353    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
354    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
355 
356    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
357    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
358    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
359    for communicators controlling multiple processes.  It is recommended that you call both of
360    the above preallocation routines for simplicity.
361 
362    Options Database Keys:
363 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
364 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
365 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
366 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
367 
368   Level: beginner
369 
370  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
373