xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 52030a5ef06b2893ca16b812ed5ce24d356d0401)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_CXX_COMPLEX_FIX
3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
4 
5 #include <petscconf.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
7 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
8 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
9 
10 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
11 
/* Preallocate the diagonal/off-diagonal blocks of an MPIAIJCUSPARSE matrix.
   Mirrors MatMPIAIJSetPreallocation_MPIAIJ, but forces both sequential blocks
   to MATSEQAIJCUSPARSE and wires them to the shared cusparse handle/stream. */
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscErrorCode     ierr;
  PetscInt           row;

  PetscFunctionBegin;
  /* Finalize row/column layouts before touching per-row preallocation data. */
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* Validate the per-row counts; d_nnz is fully checked before o_nnz so the
     reported error matches the classic MPIAIJ implementation. */
  if (d_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (d_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",row,d_nnz[row]);
    }
  }
  if (o_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (o_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",row,o_nnz[row]);
    }
  }
  if (!B->preallocated) {
    /* First preallocation: explicitly create the two MATSEQAIJCUSPARSE blocks
       (diagonal part mpiaij->A and off-diagonal part mpiaij->B). */
    ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(mpiaij->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(mpiaij->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(mpiaij->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpiaij->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(mpiaij->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(mpiaij->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(mpiaij->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpiaij->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* Propagate the chosen GPU storage formats and share the single cusparse
     handle/stream between both sequential blocks. */
  ierr = MatCUSPARSESetFormat(mpiaij->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(mpiaij->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpiaij->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpiaij->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpiaij->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpiaij->B,cusp->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
57 
/* yy = A*xx for an MPIAIJ matrix whose blocks live on the GPU.
   The off-process entries of xx are scattered into mpiaij->lvec while the
   diagonal-block product is computed, overlapping communication with work. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  /* The local length of xx must match the local column count of A. */
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,xx);CHKERRQ(ierr);
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* yy = A_diag * xx while ghost values are in flight ... */
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* ... then yy += A_offdiag * lvec once they have arrived. */
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
75 
/* zz = yy + A*xx, with the same scatter/compute overlap as MatMult above. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  /* The local length of xx must match the local column count of A. */
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,xx);CHKERRQ(ierr);
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* zz = yy + A_diag * xx while ghost values are in flight ... */
  ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* ... then zz += A_offdiag * lvec once they have arrived. */
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
93 
/* yy = A^T*xx.  The off-diagonal transpose product is computed into lvec
   first, then reverse-scattered (with addition) into yy on top of the
   diagonal-block transpose product. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  /* For the transpose, xx must conform to the row layout of A. */
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,xlen);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,mpiaij->lvec);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  /* Accumulate the off-process contributions held in lvec back into yy. */
  ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
111 
/* Record the requested cusparse storage format for the diagonal and/or
   off-diagonal block of an MPIAIJCUSPARSE matrix.  The formats are applied
   to the sequential blocks at preallocation time. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* Fixed error text: the old message named MAT_CUSPARSE_MULT_DIAG twice and a
       nonexistent MAT_CUSPARSE_MULT_ALL; list the operations this switch accepts. */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
134 
/* Process -mat_cusparse_* options controlling the GPU storage format of the
   diagonal and off-diagonal blocks (only meaningful for unfactored matrices). */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                set;
  Mat_MPIAIJ               *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* Diagonal block only. */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&format,&set);CHKERRQ(ierr);
    if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);}
    /* Off-diagonal block only. */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->offdiagGPUMatFormat,(PetscEnum*)&format,&set);CHKERRQ(ierr);
    if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);}
    /* Both blocks at once; overrides the two options above when given last. */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&format,&set);CHKERRQ(ierr);
    if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
165 
/* Finish assembly via the generic MPIAIJ path, then, on the first final
   assembly, switch the ghost-value work vector lvec to VECSEQCUDA so the
   scatter targets GPU memory. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  /* lvec is created inside MatAssemblyEnd_MPIAIJ, hence the retype afterwards. */
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
179 
/* Tear down the GPU-side state (per-block handles, shared cusparse handle,
   optional stream, the Mat_MPIAIJCUSPARSE struct itself), then delegate to
   the generic MPIAIJ destructor. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    /* Detach the shared handle from the sequential blocks before destroying it. */
    if (mpiaij->A) { ierr = MatCUSPARSEClearHandle(mpiaij->A);CHKERRQ(ierr); }
    if (mpiaij->B) { ierr = MatCUSPARSEClearHandle(mpiaij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);
    /* The stream is only destroyed when one was actually created (non-NULL). */
    if (cusp->stream) { err = cudaStreamDestroy(cusp->stream);CHKERRCUDA(err); }
    delete cusp;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* Remove the type-specific composed methods installed by MatConvert. */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
205 
/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE: copy/duplicate as requested,
   switch the default vector type to CUDA, allocate the GPU-side struct (with
   a fresh cusparse handle) when needed, and install the CUSPARSE method table. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij;
  Mat_MPIAIJCUSPARSE *cusp;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* For MAT_INPLACE_MATRIX, *newmat is B itself and is converted in place. */
  A = *newmat;

  /* Vectors created from this matrix default to the CUDA implementation. */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  mpiaij = (Mat_MPIAIJ*)A->data;
  if (reuse != MAT_REUSE_MATRIX && !mpiaij->spptr) {
    mpiaij->spptr = new Mat_MPIAIJCUSPARSE;

    /* Defaults: CSR storage for both blocks, no dedicated stream. */
    cusp                      = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
    cusp->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusp->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusp->stream              = 0;
    stat = cusparseCreate(&(cusp->handle));CHKERRCUSPARSE(stat);
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
248 
/* Constructor registered for MATMPIAIJCUSPARSE: verify the CUDA runtime is
   usable, build a plain MPIAIJ matrix, then convert it in place. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
259 
260 /*@
261    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
263    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
264    assembly performance the user should preallocate the matrix storage by setting
265    the parameter nz (or the array nnz).  By setting these parameters accurately,
266    performance during matrix assembly can be increased by more than a factor of 50.
267 
268    Collective
269 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
         (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
         (possibly different for each row) or NULL
277 
278    Output Parameter:
279 .  A - the matrix
280 
281    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
282    MatXXXXSetPreallocation() paradigm instead of this routine directly.
283    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
284 
285    Notes:
286    If nnz is given then nz is ignored
287 
288    The AIJ format (also called the Yale sparse matrix format or
289    compressed row storage), is fully compatible with standard Fortran 77
290    storage.  That is, the stored row and column indices can begin at
291    either one (as in Fortran) or zero.  See the users' manual for details.
292 
293    Specify the preallocated storage with either nz or nnz (not both).
294    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
295    allocation.  For large problems you MUST preallocate memory or you
296    will get TERRIBLE performance, see the users' manual chapter on matrices.
297 
298    By default, this format uses inodes (identical nodes) when possible, to
299    improve numerical efficiency of matrix-vector products and solves. We
300    search for consecutive rows with the same nonzero structure, thereby
301    reusing matrix information to achieve increased efficiency.
302 
303    Level: intermediate
304 
305 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
306 @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    nproc;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  /* Pick the parallel or sequential CUSPARSE type based on the communicator size;
     only the MPI variant takes the off-diagonal preallocation arguments. */
  ierr = MPI_Comm_size(comm,&nproc);CHKERRQ(ierr);
  if (nproc > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
325 
326 /*MC
327    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
328 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
330    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
331    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
332 
333    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
334    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
335    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
336    for communicators controlling multiple processes.  It is recommended that you call both of
337    the above preallocation routines for simplicity.
338 
339    Options Database Keys:
340 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
341 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
342 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
343 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
344 
345   Level: beginner
346 
347  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
350 #else
351 
352 /* The following stubs are only provided to satisfy the linker */
353 
/* Linker stub: with CUDA >= 11 the implementation above is compiled out, so
   this symbol only exists to fail cleanly if anyone calls it. */
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE in CUDA 11 is currently not supported!");
}
358 #endif
359