#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include "mpicusparsematimpl.h"

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
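/* Preallocates the diagonal (b->A) and off-diagonal (b->B) blocks as explicit
   MATSEQAIJCUSPARSE matrices and pushes the currently selected GPU storage
   formats down to them. */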
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  ierr = PetscLayoutSetBlockSize(B->rmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize(B->cmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatSetOption_SeqAIJCUSPARSE(b->A,cusparseStruct->diagGPUMatFormat,PETSC_TRUE);CHKERRQ(ierr);
  ierr = MatSetOption_SeqAIJCUSPARSE(b->B,cusparseStruct->offdiagGPUMatFormat,PETSC_TRUE);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
EXTERN_C_END
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE"
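/* Creates GPU (VECCUSP) work vectors whose layouts are compatible with the
   column space (right) and row space (left) of the matrix. */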
PetscErrorCode  MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  // This multiplication sequence differs from the CPU version. The diagonal
  // block multiplication kernel is launched in one stream. In a separate
  // stream, the DeviceToHost transfer, the MPI messaging, and the HostToDevice
  // transfer of the ghost values are performed. Once the data transfer stream
  // has been synchronized, to ensure the messaging is complete, the
  // off-diagonal MatMultAdd kernel is launched in the original (MatMult)
  // stream to protect against race conditions.
  //
  // This sequence should only be called for GPU computation.
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  // As in MatMult_MPIAIJCUSPARSE, this sequence is arranged so that the GPU
  // kernels and the device-host data transfers (with the MPI messaging in
  // between) can overlap. The transpose product of the off-diagonal block is
  // computed into the ghost vector lvec and the transpose product of the
  // diagonal block into yy; the ghost contributions are then scattered in
  // reverse and added into yy once the messaging is complete.
  //
  // This sequence should only be called for GPU computation.
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

EXTERN_C_BEGIN
PetscErrorCode  MatCreate_MPIAIJ(Mat);
EXTERN_C_END
//PetscErrorCode MatSetValuesBatch_MPIAIJCUSPARSE(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats);


#undef __FUNCT__
#define __FUNCT__ "MatSetOption_MPIAIJCUSPARSE"
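/* Records the requested GPU storage format (CSR, ELL, or HYB) for the diagonal
   and/or off-diagonal block in the Mat_MPIAIJCUSPARSE structure, after first
   passing the option on to MatSetOption_MPIAIJ(). The DIA format is not
   supported. */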
PetscErrorCode MatSetOption_MPIAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatSetOption_MPIAIJ(A,op,flg);CHKERRQ(ierr);
  switch (op) {
  case MAT_DIAGBLOCK_CSR:
    cusparseStruct->diagGPUMatFormat = MAT_DIAGBLOCK_CSR;
    break;
  case MAT_OFFDIAGBLOCK_CSR:
    cusparseStruct->offdiagGPUMatFormat = MAT_OFFDIAGBLOCK_CSR;
    break;
  case MAT_DIAGBLOCK_DIA:
  case MAT_OFFDIAGBLOCK_DIA:
  case MAT_DIA:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported GPU matrix storage format DIA for (MPI,SEQ)AIJCUSPARSE matrix type.");
  case MAT_DIAGBLOCK_ELL:
    cusparseStruct->diagGPUMatFormat = MAT_DIAGBLOCK_ELL;
    break;
  case MAT_OFFDIAGBLOCK_ELL:
    cusparseStruct->offdiagGPUMatFormat = MAT_OFFDIAGBLOCK_ELL;
    break;
  case MAT_DIAGBLOCK_HYB:
    cusparseStruct->diagGPUMatFormat = MAT_DIAGBLOCK_HYB;
    break;
  case MAT_OFFDIAGBLOCK_HYB:
    cusparseStruct->offdiagGPUMatFormat = MAT_OFFDIAGBLOCK_HYB;
    break;
  default:
    break;
  }
  PetscFunctionReturn(0);
}


EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
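/* Reads the command line options that select the GPU storage formats used for
   the diagonal and off-diagonal blocks in the sparse matrix-vector product. */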
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscErrorCode     ierr;
  PetscInt           idxDiag=0,idxOffDiag=0;
  const char         *formats[]={CSR,ELL,HYB};
  MatOption          diagFormat,offdiagFormat;
  PetscBool          flg;

  PetscFunctionBegin;
  ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"When using TxPETSCGPU, MPIAIJCUSPARSE Options","Mat");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEList("-mat_mult_cusparse_diag_storage_format",
                             "Set the storage format of (mpi)aijcusparse gpu matrices for SpMV",
                             "None",formats,3,formats[0],&idxDiag,&flg);CHKERRQ(ierr);
    ierr = PetscOptionsEList("-mat_mult_cusparse_offdiag_storage_format",
                             "Set the storage format of (mpi)aijcusparse gpu matrices for SpMV",
                             "None",formats,3,formats[0],&idxOffDiag,&flg);CHKERRQ(ierr);

    /* Map the selected list index onto the corresponding GPU storage format. */
    if (idxDiag == 0)      diagFormat = MAT_CSR;
    else if (idxDiag == 1) diagFormat = MAT_ELL;
    else                   diagFormat = MAT_HYB;

    if (idxOffDiag == 0)      offdiagFormat = MAT_CSR;
    else if (idxOffDiag == 1) offdiagFormat = MAT_ELL;
    else                      offdiagFormat = MAT_HYB;

    cusparseStruct->diagGPUMatFormat    = diagFormat;
    cusparseStruct->offdiagGPUMatFormat = offdiagFormat;
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  try {
    delete cusparseStruct;
  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  a->spptr = 0;
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
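/* Builds the MPIAIJCUSPARSE matrix on top of MatCreate_MPIAIJ(): composes the
   CUSPARSE-specific preallocation routine, allocates the Mat_MPIAIJCUSPARSE
   structure (defaulting both blocks to CSR storage), and overrides the
   operations that are implemented on the GPU. */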
PetscErrorCode  MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatMPIAIJSetPreallocation_C",
                                           "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE",
                                           MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  a              = (Mat_MPIAIJ*)A->data;
  a->spptr       = new Mat_MPIAIJCUSPARSE;
  cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  cusparseStruct->diagGPUMatFormat    = MAT_DIAGBLOCK_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_OFFDIAGBLOCK_CSR;

  A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->setoption      = MatSetOption_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END


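/* Convenience constructor: creates a MATMPIAIJCUSPARSE matrix on communicators
   with more than one process and a MATSEQAIJCUSPARSE matrix otherwise, and
   applies the given preallocation to the appropriate blocks. */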
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATMPIAIJCUSPARSE - MATMPIAIJCUSPARSE = "mpiaijcusparse" - A matrix type to be used for sparse matrices
   whose matrix-vector products are performed on the GPU with the NVIDIA CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_mult_cusparse_diag_storage_format - sets the GPU storage format (CSR, ELL, or HYB) of the diagonal block used in MatMult()
-  -mat_mult_cusparse_offdiag_storage_format - sets the GPU storage format (CSR, ELL, or HYB) of the off-diagonal block used in MatMult()

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MatCreateMPIAIJ(), MATSEQAIJCUSPARSE, MATMPIAIJ, MATSEQAIJ
M*/
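
/*
   A minimal usage sketch (illustrative only, not part of the library).  Here m, n, M, N,
   d_nz and o_nz are placeholder sizes and nonzero estimates chosen by the caller, and
   x, y are vectors conforming to the matrix layout, e.g. obtained from MatGetVecs().

     Mat A;
     ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,m,n,M,N,d_nz,PETSC_NULL,o_nz,PETSC_NULL,&A);CHKERRQ(ierr);
     ...  insert entries with MatSetValues()  ...
     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatMult(A,x,y);CHKERRQ(ierr);
     ierr = MatDestroy(&A);CHKERRQ(ierr);
*/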