#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create two MATSEQAIJCUSPARSE matrices: b->A holds the diagonal block, b->B the off-diagonal block */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

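/* A minimal usage sketch for the preallocation above (illustrative only; the
   matrix name A, the global size of 100, and the estimates of 5 diagonal and
   2 off-diagonal nonzeros per row are assumptions, not taken from this file):

     Mat A;
     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,100,100);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);CHKERRQ(ierr);
*/
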
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  /* Overlap communication with computation: start the scatter of the ghost values,
     multiply by the diagonal block while the messages are in flight, then finish
     the scatter and add in the off-diagonal contribution */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr);
  /* For the transpose the data flow is reversed: apply both local transpose products
     first, then scatter the off-diagonal contribution back to its owning processes
     and add it into yy */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

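/* A minimal usage sketch for MatCUSPARSESetFormat() (illustrative only; the
   matrix A is assumed to already be of type MATMPIAIJCUSPARSE):

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_ELL);CHKERRQ(ierr);

   This selects the ELL (ellpack) storage format for both the diagonal and
   off-diagonal blocks used in subsequent SpMV operations. */
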
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

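/* The options above are typically supplied on the command line, e.g.
   (illustrative; "./app" stands in for any PETSc program):

     ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb

   which selects the HYB storage format for both the diagonal and off-diagonal
   blocks of every matrix whose options are processed by MatSetFromOptions(). */
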
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *mpiaij;

  PetscFunctionBegin;
  mpiaij = (Mat_MPIAIJ*)A->data;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    /* The ghost values are gathered into lvec for the GPU multiply, so the local work vector must be a CUDA vector as well */
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    if (a->A) { ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr); }
    if (a->B) { ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;

  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

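/* A minimal usage sketch for converting an existing MPIAIJ matrix in place
   (illustrative only; A is assumed to be an assembled MATMPIAIJ matrix):

     ierr = MatConvert(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
*/
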
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   Notes:
   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic
   memory allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

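/* A minimal usage sketch for MatCreateAIJCUSPARSE() (illustrative only; the global
   size of 100 and the per-row estimates of 5 diagonal and 2 off-diagonal nonzeros
   are assumptions, not taken from this file):

     Mat A;
     ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,&A);CHKERRQ(ierr);
     // ... insert entries with MatSetValues(), then ...
     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
*/
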
/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
   CSR, ELL, or hybrid (HYB) format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal blocks during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal block during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal block during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
348