xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 34136279daf4c5803e26cd9ecf5a2cf0d75aa7fe)
1 #define PETSC_SKIP_SPINLOCK
2 
3 #include <petscconf.h>
4 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
5 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
6 
7 PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
8 {
9   Mat_MPIAIJ         *b               = (Mat_MPIAIJ*)B->data;
10   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
11   PetscErrorCode     ierr;
12   PetscInt           i;
13 
14   PetscFunctionBegin;
15   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
16   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
17   if (d_nnz) {
18     for (i=0; i<B->rmap->n; i++) {
19       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
20     }
21   }
22   if (o_nnz) {
23     for (i=0; i<B->rmap->n; i++) {
24       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
25     }
26   }
27   if (!B->preallocated) {
28     /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
29     ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
30     ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
31     ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
32     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
33     ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
34     ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
35     ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
36     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
37   }
38   ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
39   ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
40   ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
41   ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
42   ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
43   ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
44   ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
45   ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
46 
47   B->preallocated = PETSC_TRUE;
48   PetscFunctionReturn(0);
49 }
50 
51 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
52 {
53   /* This multiplication sequence is different sequence
54      than the CPU version. In particular, the diagonal block
55      multiplication kernel is launched in one stream. Then,
56      in a separate stream, the data transfers from DeviceToHost
57      (with MPI messaging in between), then HostToDevice are
58      launched. Once the data transfer stream is synchronized,
59      to ensure messaging is complete, the MatMultAdd kernel
60      is launched in the original (MatMult) stream to protect
61      against race conditions.
62 
63      This sequence should only be called for GPU computation. */
64   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       nt;
67 
68   PetscFunctionBegin;
69   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
70   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
71   ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
72   ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
73   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
74   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
75   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
76   ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
77   PetscFunctionReturn(0);
78 }
79 
80 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
81 {
82   /* This multiplication sequence is different sequence
83      than the CPU version. In particular, the diagonal block
84      multiplication kernel is launched in one stream. Then,
85      in a separate stream, the data transfers from DeviceToHost
86      (with MPI messaging in between), then HostToDevice are
87      launched. Once the data transfer stream is synchronized,
88      to ensure messaging is complete, the MatMultAdd kernel
89      is launched in the original (MatMult) stream to protect
90      against race conditions.
91 
92      This sequence should only be called for GPU computation. */
93   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
94   PetscErrorCode ierr;
95   PetscInt       nt;
96 
97   PetscFunctionBegin;
98   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
99   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
100   ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_REVERSE);CHKERRQ(ierr);
101   ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
102   ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
103   ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
104   ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
105   ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
106   PetscFunctionReturn(0);
107 }
108 
109 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
110 {
111   Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
112   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
113 
114   PetscFunctionBegin;
115   switch (op) {
116   case MAT_CUSPARSE_MULT_DIAG:
117     cusparseStruct->diagGPUMatFormat = format;
118     break;
119   case MAT_CUSPARSE_MULT_OFFDIAG:
120     cusparseStruct->offdiagGPUMatFormat = format;
121     break;
122   case MAT_CUSPARSE_ALL:
123     cusparseStruct->diagGPUMatFormat    = format;
124     cusparseStruct->offdiagGPUMatFormat = format;
125     break;
126   default:
127     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op);
128   }
129   PetscFunctionReturn(0);
130 }
131 
132 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
133 {
134   MatCUSPARSEStorageFormat format;
135   PetscErrorCode           ierr;
136   PetscBool                flg;
137   Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
138   Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
139 
140   PetscFunctionBegin;
141   ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
142   ierr = PetscObjectOptionsBegin((PetscObject)A);
143   if (A->factortype==MAT_FACTOR_NONE) {
144     ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
145                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
146     if (flg) {
147       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
148     }
149     ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
150                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
151     if (flg) {
152       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
153     }
154     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
155                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
156     if (flg) {
157       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
158     }
159   }
160   ierr = PetscOptionsEnd();CHKERRQ(ierr);
161   PetscFunctionReturn(0);
162 }
163 
164 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
165 {
166   PetscErrorCode ierr;
167   Mat_MPIAIJ     *mpiaij;
168 
169   PetscFunctionBegin;
170   mpiaij = (Mat_MPIAIJ*)A->data;
171   ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
172   if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
173     ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
174   }
175   PetscFunctionReturn(0);
176 }
177 
178 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
179 {
180   PetscErrorCode     ierr;
181   Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
182   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
183   cudaError_t        err;
184   cusparseStatus_t   stat;
185 
186   PetscFunctionBegin;
187   try {
188     ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
189     ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
190     stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
191     err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
192     delete cusparseStruct;
193   } catch(char *ex) {
194     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
195   }
196   cusparseStruct = 0;
197 
198   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
199   PetscFunctionReturn(0);
200 }
201 
202 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
203 {
204   PetscErrorCode     ierr;
205   Mat_MPIAIJ         *a;
206   Mat_MPIAIJCUSPARSE * cusparseStruct;
207   cudaError_t        err;
208   cusparseStatus_t   stat;
209 
210   PetscFunctionBegin;
211   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
212   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
213   ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
214   ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);
215 
216   a        = (Mat_MPIAIJ*)A->data;
217   a->spptr = new Mat_MPIAIJCUSPARSE;
218 
219   cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
220   cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
221   cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
222   stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);
223   err = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err);
224 
225   A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
226   A->ops->mult           = MatMult_MPIAIJCUSPARSE;
227   A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
228   A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
229   A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
230 
231   ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
232   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",  MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
233   PetscFunctionReturn(0);
234 }
235 
236 /*@
237    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
238    (the default parallel PETSc format).  This matrix will ultimately pushed down
239    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
240    assembly performance the user should preallocate the matrix storage by setting
241    the parameter nz (or the array nnz).  By setting these parameters accurately,
242    performance during matrix assembly can be increased by more than a factor of 50.
243 
244    Collective on MPI_Comm
245 
246    Input Parameters:
247 +  comm - MPI communicator, set to PETSC_COMM_SELF
248 .  m - number of rows
249 .  n - number of columns
250 .  nz - number of nonzeros per row (same for all rows)
251 -  nnz - array containing the number of nonzeros in the various rows
252          (possibly different for each row) or NULL
253 
254    Output Parameter:
255 .  A - the matrix
256 
257    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
258    MatXXXXSetPreallocation() paradigm instead of this routine directly.
259    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
260 
261    Notes:
262    If nnz is given then nz is ignored
263 
264    The AIJ format (also called the Yale sparse matrix format or
265    compressed row storage), is fully compatible with standard Fortran 77
266    storage.  That is, the stored row and column indices can begin at
267    either one (as in Fortran) or zero.  See the users' manual for details.
268 
269    Specify the preallocated storage with either nz or nnz (not both).
270    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
271    allocation.  For large problems you MUST preallocate memory or you
272    will get TERRIBLE performance, see the users' manual chapter on matrices.
273 
274    By default, this format uses inodes (identical nodes) when possible, to
275    improve numerical efficiency of matrix-vector products and solves. We
276    search for consecutive rows with the same nonzero structure, thereby
277    reusing matrix information to achieve increased efficiency.
278 
279    Level: intermediate
280 
281 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
282 @*/
283 PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
284 {
285   PetscErrorCode ierr;
286   PetscMPIInt    size;
287 
288   PetscFunctionBegin;
289   ierr = MatCreate(comm,A);CHKERRQ(ierr);
290   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
291   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
292   if (size > 1) {
293     ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
294     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
295   } else {
296     ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
297     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
298   }
299   PetscFunctionReturn(0);
300 }
301 
302 /*MC
303    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
304 
305    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
306    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
307    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
308 
309    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
310    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
311    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
312    for communicators controlling multiple processes.  It is recommended that you call both of
313    the above preallocation routines for simplicity.
314 
315    Options Database Keys:
316 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
317 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
318 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
319 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
320 
321   Level: beginner
322 
323  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
324 M
325 M*/
326