xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision c0174eb74783e1ccfa817272a5782dd72b4cfcef)
1 #include "petscconf.h"
2 PETSC_CUDA_EXTERN_C_BEGIN
3 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
4 PETSC_CUDA_EXTERN_C_END
5 #include "mpicusparsematimpl.h"
6 
7 EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
/* GPU flavor of MatMPIAIJSetPreallocation_MPIAIJ: validates the row estimates,
   creates the local diagonal (b->A) and off-diagonal (b->B) blocks as
   MATSEQAIJCUSPARSE matrices, preallocates them, and pushes the requested GPU
   storage formats down to them. */
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           row;

  PetscFunctionBegin;
  /* Historical defaults: 5 nonzeros per row in the diagonal block, 2 in the off-diagonal block */
  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  /* AIJ matrices are scalar: force block size 1 and finalize the layouts */
  ierr = PetscLayoutSetBlockSize(B->rmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize(B->cmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* Per-row estimates, when supplied, must be nonnegative */
  if (d_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (d_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",row,d_nnz[row]);
    }
  }
  if (o_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (o_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",row,o_nnz[row]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create the two sequential blocks with the CUSPARSE type so SpMV runs on the GPU */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* Propagate the chosen GPU storage formats to the sequential blocks */
  ierr = MatSetOption_SeqAIJCUSPARSE(b->A,cusp->diagGPUMatFormat,PETSC_TRUE);CHKERRQ(ierr);
  ierr = MatSetOption_SeqAIJCUSPARSE(b->B,cusp->offdiagGPUMatFormat,PETSC_TRUE);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
55 EXTERN_C_END
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE"
/* Create GPU (VECCUSP) work vectors that conform to the matrix layouts:
   'right' conforms to the column layout (input of MatMult), 'left' conforms
   to the row layout (output of MatMult).  Either pointer may be PETSC_NULL
   if that vector is not wanted. */
PetscErrorCode  MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* the right vector lives in the column space, so take the COLUMN block
       size (the original used mat->rmap->bs, which is wrong whenever the row
       and column block sizes differ) */
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
79 
80 
#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
/*
   y = A x for the parallel GPU matrix.

   The sequence differs from the CPU version: the diagonal-block SpMV kernel
   is launched in one stream while, in a separate stream, the DeviceToHost
   transfer, MPI messaging, and HostToDevice transfer of the ghost values
   proceed.  Once the transfer stream is synchronized (guaranteeing the
   messaging is complete), the off-diagonal MatMultAdd kernel is launched
   back in the original stream to avoid a race on yy.

   This sequence should only be called for GPU computation.
*/
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  /* xx must conform to the column layout of A */
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  /* local diagonal-block product overlaps with the ghost-value scatter */
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* accumulate the off-diagonal contribution from the gathered ghost values */
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
111 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
/*
   y = A^T x for the parallel GPU matrix.

   Fixes relative to the previous revision:
   - __FUNCT__ previously named "MatMult_MPIAIJCUSPARSE", corrupting error tracebacks;
   - the size check compared against A->cmap->n, but the input of a transpose
     multiply conforms to the ROW layout;
   - the body performed the forward-multiply sequence (forward scatter of xx
     plus multtransposeadd on lvec), which does not compute A^T x.  It now
     follows the MatMultTranspose_MPIAIJ algorithm: apply B^T into lvec and
     A^T into yy, then reverse-scatter lvec into yy with ADD_VALUES.

   This sequence should only be called for GPU computation.
*/
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  /* xx must conform to the row layout of A (it multiplies A^T) */
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  /* off-diagonal transpose product into the local ghost vector ... */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* ... overlapped with the diagonal-block transpose product */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* reverse scatter accumulates the ghost contributions into yy */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
142 
143 //PetscErrorCode MatSetValuesBatch_MPIAIJCUSPARSE(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats);
144 
145 
#undef __FUNCT__
#define __FUNCT__ "MatSetOption_MPIAIJCUSPARSE"
/* Forward the option to the generic MPIAIJ handler, then record any GPU
   storage-format request for the diagonal/off-diagonal blocks.  The DIA
   format is rejected; unrelated options are silently ignored here. */
PetscErrorCode MatSetOption_MPIAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatSetOption_MPIAIJ(A,op,flg);CHKERRQ(ierr);
  switch (op) {
  /* each diagonal-block format case stored its own op value, so assign op directly */
  case MAT_DIAGBLOCK_CSR:
  case MAT_DIAGBLOCK_ELL:
  case MAT_DIAGBLOCK_HYB:
    cusp->diagGPUMatFormat = op;
    break;
  case MAT_OFFDIAGBLOCK_CSR:
  case MAT_OFFDIAGBLOCK_ELL:
  case MAT_OFFDIAGBLOCK_HYB:
    cusp->offdiagGPUMatFormat = op;
    break;
  case MAT_DIAGBLOCK_DIA:
  case MAT_OFFDIAGBLOCK_DIA:
  case MAT_DIA:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported GPU matrix storage format DIA for (MPI,SEQ)AIJCUSPARSE matrix type.");
  default:
    break;
  }
  PetscFunctionReturn(0);
}
184 
185 
186 EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
/* Query the options database for the GPU storage formats of the diagonal and
   off-diagonal blocks and record them in the Mat_MPIAIJCUSPARSE struct.
   PetscOptionsEList returns an index into formats[]: 0=CSR, 1=ELL, 2=HYB.
   The previous revision switched on labels 0,2,3 so selecting "ell" left the
   format uninitialized and selecting "hyb" produced ELL; the labels now match
   the list, and the formats default to CSR (formats[0]). */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscErrorCode     ierr;
  PetscInt           idxDiag=0,idxOffDiag=0;
  char               *formats[]={CSR,ELL,HYB};
  MatOption          diagFormat=MAT_CSR,offdiagFormat=MAT_CSR; /* match formats[0] */
  PetscBool          flg;

  PetscFunctionBegin;
  ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"When using TxPETSCGPU, MPIAIJCUSPARSE Options","Mat");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEList("-mat_mult_cusparse_diag_storage_format",
			     "Set the storage format of (mpi)aijcusparse gpu matrices for SpMV",
			     "None",formats,3,formats[0],&idxDiag,&flg);CHKERRQ(ierr);
    ierr = PetscOptionsEList("-mat_mult_cusparse_offdiag_storage_format",
			     "Set the storage format of (mpi)aijcusparse gpu matrices for SpMV",
			     "None",formats,3,formats[0],&idxOffDiag,&flg);CHKERRQ(ierr);

    switch (idxDiag) {
    case 0: diagFormat = MAT_CSR; break;
    case 1: diagFormat = MAT_ELL; break;
    case 2: diagFormat = MAT_HYB; break;
    }

    switch (idxOffDiag) {
    case 0: offdiagFormat = MAT_CSR; break;
    case 1: offdiagFormat = MAT_ELL; break;
    case 2: offdiagFormat = MAT_HYB; break;
    }
    cusparseStruct->diagGPUMatFormat = diagFormat;
    cusparseStruct->offdiagGPUMatFormat = offdiagFormat;
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
239 EXTERN_C_END
240 
241 EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
/* Destroy the GPU-specific bookkeeping, then hand off to the generic MPIAIJ
   destructor.  The previous revision zeroed only the local pointer copy,
   leaving a->spptr dangling while MatDestroy_MPIAIJ ran; the field itself is
   now cleared. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ *a  = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct  = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  try {
    delete cusparseStruct;
  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  a->spptr = 0; /* clear the field, not a local copy, so nothing dangles below */
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
260 EXTERN_C_END
261 
262 EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
/* Type constructor: build the generic MPIAIJ matrix, attach the CUSPARSE
   bookkeeping struct (defaulting both blocks to CSR storage), and override
   the operations that have GPU implementations. */
PetscErrorCode  MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  /* route MatMPIAIJSetPreallocation() to the CUSPARSE-aware version */
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatMPIAIJSetPreallocation_C",
					   "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE",
					   MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  cusp = new Mat_MPIAIJCUSPARSE;
  cusp->diagGPUMatFormat    = MAT_DIAGBLOCK_CSR;
  cusp->offdiagGPUMatFormat = MAT_OFFDIAGBLOCK_CSR;
  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = cusp;

  A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->setoption      = MatSetOption_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
290 EXTERN_C_END
291 
292 
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
/* Convenience constructor: create an AIJCUSPARSE matrix of the given global/
   local sizes on comm, choosing MATMPIAIJCUSPARSE on parallel communicators
   and MATSEQAIJCUSPARSE on a single process, and preallocate it from the
   supplied d_nz/d_nnz (and, in parallel, o_nz/o_nnz) estimates. */
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    /* single-process communicator: the sequential type ignores the o_* arguments */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
313 
314 /*MC
   MATMPIAIJCUSPARSE - MATMPIAIJCUSPARSE = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
316 
317    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
318    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
319    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
320    for communicators controlling multiple processes.  It is recommended that you call both of
321    the above preallocation routines for simplicity.
322 
323    Options Database Keys:
324 . -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
325 
326   Level: beginner
327 
328 .seealso: MatCreateMPIAIJ,MATSEQAIJ,MATMPIAIJ, MATMPIAIJCUSPARSE, MATSEQAIJCUSPARSE
329 M*/
330 
331