xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 97929ea760a70c780fc4b78d3065eb9e422113fe)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_CXX_COMPLEX_FIX
3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
4 
5 #include <petscconf.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
7 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
8 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
9 
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscErrorCode     ierr;
  PetscInt           row;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* Validate user-supplied per-row nonzero counts before touching any storage */
  if (d_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (d_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",row,d_nnz[row]);
    }
  }
  if (o_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (o_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",row,o_nnz[row]);
    }
  }
  if (!B->preallocated) {
    /* First preallocation: explicitly create the diagonal (A) and off-diagonal (B)
       blocks as sequential matrices owned by this process. */
    ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(mpiaij->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(mpiaij->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpiaij->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(mpiaij->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(mpiaij->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpiaij->B);CHKERRQ(ierr);
  }
  /* Both blocks are MATSEQAIJCUSPARSE and the local work vector lives on the GPU */
  ierr = MatSetType(mpiaij->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(mpiaij->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  if (mpiaij->lvec) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* Propagate the GPU storage format, cuSPARSE handle and stream to both blocks */
  ierr = MatCUSPARSESetFormat(mpiaij->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(mpiaij->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpiaij->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpiaij->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpiaij->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpiaij->B,cusp->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
58 
/* y = A*x, overlapping the off-process gather with the local diagonal-block product */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  /* start gathering ghost values while the diagonal block multiplies */
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* accumulate the off-diagonal contribution into yy */
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
74 
/*
   MatZeroEntries_MPIAIJCUSPARSE - zero all stored entries of the matrix.

   If a device-side split-CSR mirror exists (created by MatCUSPARSEGetDeviceMatWrite),
   its diagonal and off-diagonal value arrays are zeroed directly on the GPU before
   the host-side blocks are zeroed.
*/
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t  err;
      PetscScalar  *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr); /* was "offfdiag" */
      /* fetch the device pointer to each value array, then zero it in place on the GPU */
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  /* zero the host-side (and cached GPU) storage of both sequential blocks */
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
/* z = y + A*x, overlapping the off-process gather with the local diagonal-block product */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  /* start gathering ghost values while the diagonal block computes zz = yy + A_diag*xx */
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* accumulate the off-diagonal contribution into zz */
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
117 
/* y = A^T*x: local transpose products first, then a reverse-scatter ADD of the
   off-diagonal contribution into the distributed result */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,xlen);
  ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
133 
/*
   MatCUSPARSESetFormat_MPIAIJCUSPARSE - record the requested cuSPARSE storage
   format for the diagonal and/or off-diagonal block of an MPIAIJCUSPARSE matrix.
   The format is applied to the blocks at preallocation time.
*/
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed message: it previously listed MAT_CUSPARSE_MULT_DIAG twice and the nonexistent MAT_CUSPARSE_MULT_ALL */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
156 
/* Process -mat_cusparse_* options controlling the GPU storage format of the
   diagonal and off-diagonal blocks (ignored for factored matrices). */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat fmt;
  PetscErrorCode           ierr;
  PetscBool                set;
  Mat_MPIAIJ               *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) { ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,fmt);CHKERRQ(ierr); }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->offdiagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) { ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,fmt);CHKERRQ(ierr); }
    /* this option overrides both of the above when given */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) { ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr); }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
187 
/*
   MatAssemblyEnd_MPIAIJCUSPARSE - finish assembly and reconcile host/device state.

   If a device-side split-CSR mirror exists, its nonzerostate (recorded when values
   were inserted on the device) is read back BEFORE the host assembly runs; if it is
   newer than the host's state afterwards, the freshest data lives on the GPU and the
   offload mask is set accordingly.
*/
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
  PetscInt                    nnz_state = A->nonzerostate;
  PetscFunctionBegin;
  if (d_mat) {
    cudaError_t                err;
    /* fetch the nonzerostate stamped into the device mirror at device-side assembly */
    err = cudaMemcpy( &nnz_state, &d_mat->nonzerostate, sizeof(PetscInt), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
  }
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    /* first final assembly: the ghost-value work vector must live on the GPU */
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (nnz_state > A->nonzerostate) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}
210 
/*
   MatDestroy_MPIAIJCUSPARSE - free the CUDA/cuSPARSE resources attached to the
   matrix, then fall through to the generic MPIAIJ destroy.

   The device-side split-CSR mirror (if any) is torn down first: the cudaFree calls
   below mirror the cudaMalloc calls in MatCUSPARSEGetDeviceMatWrite, which only
   allocates fresh row-pointer arrays when the corresponding block uses compressed
   rows (otherwise the mirror borrows the block's existing device pointers).
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* pull the struct back to the host to recover the embedded device pointers */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* row pointers were only cudaMalloc'd for compressed-row blocks; see MatCUSPARSEGetDeviceMatWrite */
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* remove the type-specific method overrides before the base-class destroy */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
252 
/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE: retype the blocks and work vector
   for the GPU, attach the cuSPARSE context, and install the GPU method table. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij;
  Mat_MPIAIJCUSPARSE *cusp;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  /* MAT_INPLACE_MATRIX converts B itself (caller passed &B as newmat) */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;

  /* vectors created from this matrix default to CUDA */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  mpiaij = (Mat_MPIAIJ*)A->data;
  if (mpiaij->A)    { ierr = MatSetType(mpiaij->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (mpiaij->B)    { ierr = MatSetType(mpiaij->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }

  if (reuse != MAT_REUSE_MATRIX && !mpiaij->spptr) {
    /* fresh conversion: build the cuSPARSE context with CSR defaults */
    mpiaij->spptr = new Mat_MPIAIJCUSPARSE;

    cusp                      = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
    cusp->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusp->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusp->stream              = 0;
    stat = cusparseCreate(&(cusp->handle));CHKERRCUSPARSE(stat);
    cusp->deviceMat = NULL;
  }

  /* install the GPU implementations */
  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
303 
/*
   MatCreate_MPIAIJCUSPARSE - type constructor: verify CUDA is usable, build a
   plain MPIAIJ matrix, then convert it in place to MPIAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
314 
315 /*@
316    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
318    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
319    assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these parameters accurately,
321    performance during matrix assembly can be increased by more than a factor of 50.
322 
323    Collective
324 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it determined from M)
.  n - number of local columns (or PETSC_DECIDE to have it determined from N)
.  M - number of global rows (or PETSC_DETERMINE to have it determined from m)
.  N - number of global columns (or PETSC_DETERMINE to have it determined from n)
.  d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal portion of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal portion of the local submatrix (possibly different for each row) or NULL
332 
333    Output Parameter:
334 .  A - the matrix
335 
336    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
337    MatXXXXSetPreallocation() paradigm instead of this routine directly.
338    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
339 
340    Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
342 
343    The AIJ format (also called the Yale sparse matrix format or
344    compressed row storage), is fully compatible with standard Fortran 77
345    storage.  That is, the stored row and column indices can begin at
346    either one (as in Fortran) or zero.  See the users' manual for details.
347 
348    Specify the preallocated storage with either nz or nnz (not both).
349    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
350    allocation.  For large problems you MUST preallocate memory or you
351    will get TERRIBLE performance, see the users' manual chapter on matrices.
352 
353    By default, this format uses inodes (identical nodes) when possible, to
354    improve numerical efficiency of matrix-vector products and solves. We
355    search for consecutive rows with the same nonzero structure, thereby
356    reusing matrix information to achieve increased efficiency.
357 
358    Level: intermediate
359 
360 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
361 @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    nproc;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&nproc);CHKERRQ(ierr);
  /* pick the sequential or parallel CUSPARSE type based on the communicator size */
  if (nproc == 1) {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
380 
381 /*MC
382    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
383 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
385    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
386    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
387 
388    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
389    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
390    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
391    for communicators controlling multiple processes.  It is recommended that you call both of
392    the above preallocation routines for simplicity.
393 
394    Options Database Keys:
395 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
396 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
397 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
398 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
399 
400   Level: beginner
401 
402  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
405 
/*
   MatCUSPARSEGetDeviceMatWrite - get a GPU pointer to a stripped-down split-CSR
   view of the matrix, for both Seq and MPI Mat.

   The device structure is created lazily on first call and cached (in the Seq or
   MPI cusparse spptr); where possible it borrows the CSR arrays already on the
   device, allocating fresh row-pointer copies only for compressed-row blocks.
   On return the matrix is flagged unassembled so device-side MatSetValues can
   write into it.
*/
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      /* sequential: only the "diagonal" block exists */
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
      }
    } else {
      /* parallel: grab the device CSR arrays of both the diagonal (A) and off-diagonal (B) blocks */
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
        /* debugging dump of the off-diagonal block; disabled (rank is never -1) */
        if (rank==-1) {
          for(unsigned int i = 0; i < matrixB->row_offsets->size(); i++)
            std::cout << "\trow_offsets[" << i << "] = " << (*matrixB->row_offsets)[i] << std::endl;
          for(unsigned int i = 0; i < matrixB->column_indices->size(); i++)
            std::cout << "\tcolumn_indices[" << i << "] = " << (*matrixB->column_indices)[i] << std::endl;
          for(unsigned int i = 0; i < matrixB->values->size(); i++)
            std::cout << "\tvalues[" << i << "] = " << (*matrixB->values)[i] << std::endl;
        }
      } else {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
      }
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    /* factored matrices have no split-CSR device view */
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
  }
  if (!*p_d_mat) {
    cudaError_t                 err;
    PetscSplitCSRDataStructure  *d_mat, h_mat;
    Mat_SeqAIJ                  *jaca;
    PetscInt                    n = A->rmap->n, nnz;
    // create and copy: build h_mat on the host, then copy it into d_mat on the device
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.nonzerostate = A->nonzerostate;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
      h_mat.seq = PETSC_TRUE;
    } else {
      Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ  *jacb;
      h_mat.seq = PETSC_FALSE; // for MatAssemblyEnd_SeqAIJCUSPARSE
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      h_mat.nonzerostate = aij->A->nonzerostate; // just keep one nonzero state?
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazy) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9; // sentinel to detect an out-of-range garray write below
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if(aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      /* compressed-row blocks get a fresh device copy of the row pointers; freed in MatDestroy */
      if (jacb->compressedrow.use) {
        err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else {
        h_mat.offdiag.i = bi;
      }
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}
559