#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ     *b;
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  ierr = PetscLayoutSetBlockSize(B->rmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize(B->cmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  b = (Mat_MPIAIJ*)B->data;

  if (!B->preallocated) {
    /* Explicitly create the two sequential on-process blocks: b->A holds the
       diagonal block and b->B the off-diagonal block, both of type
       MATSEQAIJCUSPARSE so that they are stored on the GPU. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,b->B);CHKERRQ(ierr);
  }

  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatSetOption_SeqAIJCUSPARSE(b->A,b->diagGPUMatFormat,PETSC_TRUE);CHKERRQ(ierr);
  ierr = MatSetOption_SeqAIJCUSPARSE(b->B,b->offdiagGPUMatFormat,PETSC_TRUE);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
EXTERN_C_END
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE"
PetscErrorCode  MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr); /* the right vector conforms to the column layout */
    ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
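
/* A usage sketch (error checking omitted): layout-compatible GPU vectors for
   products with the matrix are obtained through the generic MatGetVecs()
   interface, which dispatches to the routine above:

     Vec x,y;
     MatGetVecs(A,&x,&y);   // x conforms to the column map, y to the row map
*/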
#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  // This multiplication sequence differs from the CPU version.
  // In particular, the diagonal block multiplication kernel is
  // launched in one stream. Then, in a separate stream, the data
  // transfers from DeviceToHost (with MPI messaging in between),
  // then HostToDevice are launched. Once the data transfer stream
  // is synchronized, to ensure messaging is complete, the
  // MatMultAdd kernel is launched in the original (MatMult)
  // stream to protect against race conditions.
  //
  // This sequence should only be called for GPU computation.
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  /* diagonal part: yy = A_diag * xx */
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  /* gather the needed off-process entries of xx into a->lvec */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* off-diagonal part: yy += B * lvec */
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  // The transpose product yy = A^T xx is assembled from the transposes
  // of the two on-process blocks: the off-diagonal block transpose
  // kernel is launched first, and its result, which belongs to other
  // processes, is scattered back in reverse mode (DeviceToHost
  // transfer, MPI messaging, then HostToDevice transfer) in a separate
  // stream while the diagonal block transpose kernel runs. Once the
  // data transfer stream is synchronized, to ensure messaging is
  // complete, the remote contributions are added into yy to protect
  // against race conditions.
  //
  // This sequence should only be called for GPU computation.
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  /* off-diagonal part: a->lvec = B^T * xx */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* start sending the off-process contributions back to their owners */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  /* diagonal part, overlapped with the communication: yy = A_diag^T * xx */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* complete the scatter; the remote contributions are added into yy only
     here, after the local product has been computed */
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

EXTERN_C_BEGIN
PetscErrorCode  MatCreate_MPIAIJ(Mat);
EXTERN_C_END


#undef __FUNCT__
#define __FUNCT__ "MatSetOption_MPIAIJCUSPARSE"
PetscErrorCode MatSetOption_MPIAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSetOption_MPIAIJ(A,op,flg);CHKERRQ(ierr);
  switch (op) {
  case DIAGBLOCK_MAT_CSR:
    a->diagGPUMatFormat = DIAGBLOCK_MAT_CSR;
    break;
  case OFFDIAGBLOCK_MAT_CSR:
    a->offdiagGPUMatFormat = OFFDIAGBLOCK_MAT_CSR;
    break;
  case DIAGBLOCK_MAT_DIA:
  case OFFDIAGBLOCK_MAT_DIA:
  case MAT_DIA:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported GPU matrix storage format DIA for (MPI,SEQ)AIJCUSPARSE matrix type.");
  case DIAGBLOCK_MAT_ELL:
    a->diagGPUMatFormat = DIAGBLOCK_MAT_ELL;
    break;
  case OFFDIAGBLOCK_MAT_ELL:
    a->offdiagGPUMatFormat = OFFDIAGBLOCK_MAT_ELL;
    break;
  case DIAGBLOCK_MAT_HYB:
    a->diagGPUMatFormat = DIAGBLOCK_MAT_HYB;
    break;
  case OFFDIAGBLOCK_MAT_HYB:
    a->offdiagGPUMatFormat = OFFDIAGBLOCK_MAT_HYB;
    break;
  default:
    break;
  }
  PetscFunctionReturn(0);
}
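
/* A usage sketch (error checking omitted): the GPU storage formats of the two
   on-process blocks are selected through the generic MatSetOption() interface,
   which dispatches to the routine above, e.g.

     MatSetOption(A,DIAGBLOCK_MAT_HYB,PETSC_TRUE);
     MatSetOption(A,OFFDIAGBLOCK_MAT_ELL,PETSC_TRUE);

   Any combination of the CSR, ELL, and HYB formats is accepted; the DIA
   format is rejected with PETSC_ERR_SUP. */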
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       idxDiag=0,idxOffDiag=0;
  const char     *formats[]={CSR,ELL,HYB};
  MatOption      diagFormat=MAT_CSR,offdiagFormat=MAT_CSR;
  PetscBool      flg;

  PetscFunctionBegin;
  ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"When using TxPETSCGPU, MPIAIJCUSPARSE Options","Mat");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEList("-mat_mult_cusparse_diag_storage_format",
                             "Set the storage format of the diagonal block of (mpi)aijcusparse gpu matrices for SpMV",
                             "None",formats,3,formats[0],&idxDiag,&flg);CHKERRQ(ierr);
    ierr = PetscOptionsEList("-mat_mult_cusparse_offdiag_storage_format",
                             "Set the storage format of the off-diagonal block of (mpi)aijcusparse gpu matrices for SpMV",
                             "None",formats,3,formats[0],&idxOffDiag,&flg);CHKERRQ(ierr);

    /* Map the returned list indices to storage formats; the index returned by
       PetscOptionsEList() identifies the selection directly, so no string
       comparison is needed. */
    switch (idxDiag) {
    case 1:  diagFormat = MAT_ELL; break;
    case 2:  diagFormat = MAT_HYB; break;
    default: diagFormat = MAT_CSR; break;
    }
    switch (idxOffDiag) {
    case 1:  offdiagFormat = MAT_ELL; break;
    case 2:  offdiagFormat = MAT_HYB; break;
    default: offdiagFormat = MAT_CSR; break;
    }
    a->diagGPUMatFormat    = diagFormat;
    a->offdiagGPUMatFormat = offdiagFormat;
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END


EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
PetscErrorCode  MatCreate_MPIAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(B);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatMPIAIJSetPreallocation_C",
                                           "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE",
                                           MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  B->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
  B->ops->mult           = MatMult_MPIAIJCUSPARSE;
  B->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_MPIAIJCUSPARSE;
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END


#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
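/* MatCreateAIJCUSPARSE - a convenience constructor (brief descriptive note,
   not a formal manual page): creates an AIJ-format matrix whose on-process
   blocks are stored on the GPU via CUSPARSE.  The arguments have the same
   meaning as for MatCreateMPIAIJ(); on a one-process communicator the result
   has type MATSEQAIJCUSPARSE, otherwise MATMPIAIJCUSPARSE. */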
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATAIJCUSPARSE = "aijcusparse" - A matrix type to be used for sparse matrices.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single-process
   communicator, and to MATMPIAIJCUSPARSE otherwise.  Consequently, MatSeqAIJSetPreallocation()
   is supported on single-process communicators, and MatMPIAIJSetPreallocation() is supported on
   communicators controlling multiple processes.  It is recommended that you simply call both
   preallocation routines; the one matching the communicator takes effect.

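   A minimal usage sketch (error checking omitted; the preallocation counts are
   illustrative only):
.vb
   Mat A;
   MatCreate(comm,&A);
   MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,M,N);
   MatSetType(A,MATAIJCUSPARSE);
   MatSeqAIJSetPreallocation(A,5,PETSC_NULL);
   MatMPIAIJSetPreallocation(A,5,PETSC_NULL,2,PETSC_NULL);
.ve
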
   Options Database Keys:
.  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()

   Level: beginner

.seealso: MatCreateMPIAIJ(), MATSEQAIJ, MATMPIAIJ, MATMPIAIJCUSPARSE, MATSEQAIJCUSPARSE
M*/