xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision faf31966beb4ad7a9cebcc2f39db0e6564e44ad0)
1 #define PETSC_SKIP_SPINLOCK
2 
3 #include <petscconf.h>
4 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
5 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
6 
7 
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
/*
   Type-specific preallocation for MPIAIJCUSPARSE matrices.

   Validates the user-supplied per-row nonzero counts, creates (on first call)
   the diagonal block b->A and off-diagonal block b->B as SEQAIJCUSPARSE
   matrices, preallocates both, and propagates the GPU storage format, the
   cuSPARSE handle, and the CUDA stream stored in the Mat_MPIAIJCUSPARSE
   struct down to the two sequential blocks.

   Input:
     B             - the MPIAIJCUSPARSE matrix being preallocated
     d_nz / d_nnz  - uniform / per-row nonzero counts for the diagonal block
     o_nz / o_nnz  - uniform / per-row nonzero counts for the off-diagonal block
*/
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b               = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  /* Layouts must be finalized before local sizes (B->rmap->n etc.) are valid. */
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* Negative per-row counts are user errors; report the offending local row. */
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    /* NOTE(review): b->B is created with the full global column width
       (B->cmap->N); its columns are compressed to the ghost set during
       assembly by the MPIAIJ machinery. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* Both blocks share this matrix's GPU configuration (format, handle, stream). */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
53 
#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_MPIAIJCUSPARSE"
/*
   Creates CUDA vectors (VECCUDA) compatible with this matrix, so that
   matrix-vector products stay on the GPU.

   Output:
     right - if non-NULL, a vector conforming to the column layout (for y = A*x)
     left  - if non-NULL, a vector conforming to the row layout (for y^T = x^T*A)
*/
PetscErrorCode  MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;
  PetscInt rbs,cbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUDA);CHKERRQ(ierr);
    /* Share the matrix's column layout object rather than recomputing one. */
    ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUDA);CHKERRQ(ierr);
    /* Share the matrix's row layout object. */
    ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);


  }
  PetscFunctionReturn(0);
}
81 
82 
#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
/* Computes yy = A*xx on the GPU, overlapping the diagonal-block product
   with the ghost-value communication for the off-diagonal block. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence is different sequence
     than the CPU version. In particular, the diagonal block
     multiplication kernel is launched in one stream. Then,
     in a separate stream, the data transfers from DeviceToHost
     (with MPI messaging in between), then HostToDevice are
     launched. Once the data transfer stream is synchronized,
     to ensure messaging is complete, the MatMultAdd kernel
     is launched in the original (MatMult) stream to protect
     against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  /* For y = A*x the input must conform to the column layout. */
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  /* Diagonal-block product runs while the ghost values of xx are scattered. */
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* Add the off-diagonal contribution B*lvec once communication is done. */
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
113 
114 #undef __FUNCT__
115 #define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
116 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
117 {
118   /* This multiplication sequence is different sequence
119      than the CPU version. In particular, the diagonal block
120      multiplication kernel is launched in one stream. Then,
121      in a separate stream, the data transfers from DeviceToHost
122      (with MPI messaging in between), then HostToDevice are
123      launched. Once the data transfer stream is synchronized,
124      to ensure messaging is complete, the MatMultAdd kernel
125      is launched in the original (MatMult) stream to protect
126      against race conditions.
127 
128      This sequence should only be called for GPU computation. */
129   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
130   PetscErrorCode ierr;
131   PetscInt       nt;
132 
133   PetscFunctionBegin;
134   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
135   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
136   ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
137   ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
138   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
139   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
140   ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
141   ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
142   PetscFunctionReturn(0);
143 }
144 
145 #undef __FUNCT__
146 #define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
147 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
148 {
149   Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
150   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
151 
152   PetscFunctionBegin;
153   switch (op) {
154   case MAT_CUSPARSE_MULT_DIAG:
155     cusparseStruct->diagGPUMatFormat = format;
156     break;
157   case MAT_CUSPARSE_MULT_OFFDIAG:
158     cusparseStruct->offdiagGPUMatFormat = format;
159     break;
160   case MAT_CUSPARSE_ALL:
161     cusparseStruct->diagGPUMatFormat    = format;
162     cusparseStruct->offdiagGPUMatFormat = format;
163     break;
164   default:
165     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op);
166   }
167   PetscFunctionReturn(0);
168 }
169 
170 #undef __FUNCT__
171 #define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
172 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
173 {
174   MatCUSPARSEStorageFormat format;
175   PetscErrorCode           ierr;
176   PetscBool                flg;
177   Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
178   Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
179 
180   PetscFunctionBegin;
181   ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
182   ierr = PetscObjectOptionsBegin((PetscObject)A);
183   if (A->factortype==MAT_FACTOR_NONE) {
184     ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
185                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
186     if (flg) {
187       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
188     }
189     ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
190                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
191     if (flg) {
192       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
193     }
194     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
195                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
196     if (flg) {
197       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
198     }
199   }
200   ierr = PetscOptionsEnd();CHKERRQ(ierr);
201   PetscFunctionReturn(0);
202 }
203 
204 #undef __FUNCT__
205 #define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
206 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
207 {
208   PetscErrorCode     ierr;
209   Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
210   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
211   cudaError_t        err;
212   cusparseStatus_t   stat;
213 
214   PetscFunctionBegin;
215   try {
216     ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
217     ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
218     stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
219     err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
220     delete cusparseStruct;
221   } catch(char *ex) {
222     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
223   }
224   cusparseStruct = 0;
225 
226   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
227   PetscFunctionReturn(0);
228 }
229 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
/*
   Constructor for MATMPIAIJCUSPARSE: builds on MatCreate_MPIAIJ, allocates
   the type-specific Mat_MPIAIJCUSPARSE struct (storage formats, cuSPARSE
   handle, CUDA stream), and installs the GPU-aware operation overrides.
*/
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE * cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  /* Base MPIAIJ construction first; we then specialize it. */
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  /* Override preallocation so the blocks come out as SEQAIJCUSPARSE. */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  /* CSR is the default storage format for both blocks; one handle and one
     stream are shared by the diagonal and off-diagonal blocks. */
  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);
  err = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err);

  A->ops->getvecs        = MatCreateVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",  MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
262 
263 /*@
264    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
265    (the default parallel PETSc format).  This matrix will ultimately pushed down
266    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
267    assembly performance the user should preallocate the matrix storage by setting
268    the parameter nz (or the array nnz).  By setting these parameters accurately,
269    performance during matrix assembly can be increased by more than a factor of 50.
270 
271    Collective on MPI_Comm
272 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal portion of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal portion of the local submatrix (possibly different for each row) or NULL
280 
281    Output Parameter:
282 .  A - the matrix
283 
284    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
285    MatXXXXSetPreallocation() paradigm instead of this routine directly.
286    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
287 
288    Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
290 
291    The AIJ format (also called the Yale sparse matrix format or
292    compressed row storage), is fully compatible with standard Fortran 77
293    storage.  That is, the stored row and column indices can begin at
294    either one (as in Fortran) or zero.  See the users' manual for details.
295 
296    Specify the preallocated storage with either nz or nnz (not both).
297    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
298    allocation.  For large problems you MUST preallocate memory or you
299    will get TERRIBLE performance, see the users' manual chapter on matrices.
300 
301    By default, this format uses inodes (identical nodes) when possible, to
302    improve numerical efficiency of matrix-vector products and solves. We
303    search for consecutive rows with the same nonzero structure, thereby
304    reusing matrix information to achieve increased efficiency.
305 
306    Level: intermediate
307 
308 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
309 @*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
/* See the manual page above: creates an AIJCUSPARSE matrix, choosing the
   sequential or MPI variant based on the communicator size, and forwards
   the preallocation arguments to the matching preallocation routine. */
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    /* Single-process communicator: the sequential type only needs the
       diagonal-block preallocation; o_nz/o_nnz are unused. */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
330 
331 /*M
332    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
333 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
335    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
336    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
337 
338    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
339    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
340    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
341    for communicators controlling multiple processes.  It is recommended that you call both of
342    the above preallocation routines for simplicity.
343 
344    Options Database Keys:
345 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
346 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
347 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
348 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
349 
350   Level: beginner
351 
352  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
355