xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision efbe7e8a80d07327753dbe0b33efee01e046af3f)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_CXX_COMPLEX_FIX
3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
4 
5 #include <petscconf.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
7 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
8 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
9 
/*
   MatMPIAIJSetPreallocation_MPIAIJCUSPARSE - preallocation for the MPI AIJCUSPARSE type.

   Sets up the row/column layouts, validates the per-row nonzero counts, and on first
   call creates the two local blocks b->A (diagonal) and b->B (off-diagonal) explicitly
   as MATSEQAIJCUSPARSE so they share this matrix's cuSPARSE handle/stream and storage
   formats.

   Input Parameters:
+  B     - the MPIAIJCUSPARSE matrix
.  d_nz  - nonzeros per row of the diagonal block (same for all rows), ignored per-row where d_nnz is given
.  d_nnz - per-row nonzero counts for the diagonal block, or NULL
.  o_nz  - nonzeros per row of the off-diagonal block (same for all rows)
-  o_nnz - per-row nonzero counts for the off-diagonal block, or NULL
*/
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b               = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* negative per-row counts are user errors; report the offending local row */
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    /* b->B is local-rows by GLOBAL-columns; it is compacted to the needed columns at assembly */
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* propagate this matrix's GPU storage formats and cuSPARSE handle/stream to both blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
55 
/*
   MatMult_MPIAIJCUSPARSE - computes yy = A*xx.

   The scatter of the off-process entries of xx into mpiaij->lvec is started first,
   the on-process diagonal-block product is computed while the communication is in
   flight, and the off-diagonal contribution is accumulated once the scatter ends.
*/
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nloc;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nloc);CHKERRQ(ierr);
  if (nloc != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nloc);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,xx);CHKERRQ(ierr);
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* overlap: local product while ghost values are in transit */
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
73 
/*
   MatZeroEntries_MPIAIJCUSPARSE - zeroes all stored values of an MPIAIJCUSPARSE matrix.

   If a stripped-down device mirror (spptr->deviceMat) was handed out via
   MatCUSPARSEGetDeviceMatWrite(), its value arrays live on the GPU and must be
   zeroed there too: the device-resident value pointers for the diagonal and
   off-diagonal blocks are fetched from the device metadata struct and cleared
   with cudaMemset.  The host-side blocks l->A and l->B are always zeroed through
   the regular MatZeroEntries dispatch.
*/
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t  err;
      PetscScalar  *vals;
      /* fixed typo: message previously said "offfdiag" */
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      /* read the device pointer to the diagonal-block values, then zero them on the GPU */
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      /* likewise for the off-diagonal block */
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
/*
   MatMultAdd_MPIAIJCUSPARSE - computes zz = yy + A*xx.

   Mirrors MatMult_MPIAIJCUSPARSE: the ghost-value scatter into mpiaij->lvec is
   overlapped with the on-process diagonal-block multiply-add, after which the
   off-diagonal contribution is accumulated into zz.
*/
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nloc;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nloc);CHKERRQ(ierr);
  if (nloc != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nloc);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,xx);CHKERRQ(ierr);
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* local part: zz = yy + diag(A)*xx, overlapped with communication */
  ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
118 
/*
   MatMultTranspose_MPIAIJCUSPARSE - computes yy = A^T * xx.

   Both local transpose products are computed first (off-diagonal result into
   mpiaij->lvec), then the reverse scatter adds the lvec contributions into the
   owning ranks' portions of yy.
*/
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nloc;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nloc);CHKERRQ(ierr);
  /* transpose product: xx must match the ROW layout of A */
  if (nloc != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nloc);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,mpiaij->lvec);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
136 
/*
   MatCUSPARSESetFormat_MPIAIJCUSPARSE - records the cuSPARSE storage format to use
   for SpMV for the diagonal block, the off-diagonal block, or both.

   Input Parameters:
+  A      - the MPIAIJCUSPARSE matrix
.  op     - which block(s) the format applies to (MAT_CUSPARSE_MULT_DIAG,
            MAT_CUSPARSE_MULT_OFFDIAG, or MAT_CUSPARSE_ALL)
-  format - the storage format (e.g. CSR)
*/
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed message: it previously listed MAT_CUSPARSE_MULT_DIAG twice and the
       nonexistent MAT_CUSPARSE_MULT_ALL instead of MAT_CUSPARSE_ALL */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
159 
/*
   MatSetFromOptions_MPIAIJCUSPARSE - processes the -mat_cusparse_* options that
   select the GPU storage format used for SpMV on the diagonal and off-diagonal
   blocks.  The options are only offered for unfactored matrices
   (A->factortype == MAT_FACTOR_NONE).
*/
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat fmt;
  PetscErrorCode           ierr;
  PetscBool                set;
  Mat_MPIAIJ               *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* diagonal (on-process) block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,fmt);CHKERRQ(ierr);
    }
    /* off-diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->offdiagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,fmt);CHKERRQ(ierr);
    }
    /* both blocks at once */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
190 
/*
   MatAssemblyEnd_MPIAIJCUSPARSE - completes assembly for the MPI AIJCUSPARSE type.

   If a device mirror exists (values may have been inserted from device code via
   MatCUSPARSEGetDeviceMatWrite), its nonzerostate is read back and, when it is
   newer than the host matrix's state, the matrix is marked as up-to-date on the
   GPU.  On first final assembly the ghost vector lvec is switched to VECSEQCUDA.
*/
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
  PetscInt                    nnz_state = A->nonzerostate;
  PetscFunctionBegin;
  if (d_mat) {
    cudaError_t                err;
    /* overwrite nnz_state with the device-side nonzerostate; compared below
       after MatAssemblyEnd_MPIAIJ may have advanced A->nonzerostate */
    err = cudaMemcpy( &nnz_state, &d_mat->nonzerostate, sizeof(PetscInt), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
  }
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    /* ghost values must live on the GPU for the CUDA scatter paths */
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (nnz_state > A->nonzerostate) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}
213 
/*
   MatDestroy_MPIAIJCUSPARSE - destroys the GPU-specific state of an MPIAIJCUSPARSE
   matrix, then delegates to MatDestroy_MPIAIJ.

   The stripped-down device mirror (deviceMat), if present, owns only some of the
   device arrays it points at: copies of the compressed-row index arrays (made in
   MatCUSPARSEGetDeviceMatWrite when compressedrow.use is set) and the colmap; the
   CSR j/a arrays belong to the seq blocks and are freed with them.
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* pull the struct back to the host so we can free the arrays it points to */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* the i arrays were device COPIES only when compressed-row storage was in use */
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* PETSc's CUDA C++ layer reports failures by throwing char*; convert to a PETSc error */
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* remove the composed methods installed by MatConvert_MPIAIJ_MPIAIJCUSPARSE */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
254 
/*
   MatConvert_MPIAIJ_MPIAIJCUSPARSE - converts an MPIAIJ matrix to MPIAIJCUSPARSE.

   Supports MAT_INITIAL_MATRIX (duplicate B), MAT_REUSE_MATRIX (copy B into
   *newmat), and MAT_INPLACE_MATRIX (neither branch taken; *newmat must be B).
   Installs the CUDA default vector type, allocates the Mat_MPIAIJCUSPARSE
   per-type data (with a cuSPARSE handle), and overrides the type-specific ops.
*/
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;

  /* vectors created from this matrix default to CUDA vectors */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->stream              = 0;   /* default (NULL) stream */
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;          /* device mirror created lazily on demand */
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
299 
/*
   MatCreate_MPIAIJCUSPARSE - registered constructor for MATMPIAIJCUSPARSE:
   builds a plain MPIAIJ matrix and converts it to the CUSPARSE type in place.
*/
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* ensure a CUDA device is initialized before any GPU state is created */
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
310 
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz).  By setting these parameters
   accurately, performance during matrix assembly can be increased by more than a
   factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the
           off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz-style or nnz-style arguments
   (not both).  Set d_nz=PETSC_DEFAULT and d_nnz=NULL for PETSc to control dynamic
   memory allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* single-process communicators get the sequential type, matching the MATAIJCUSPARSE convention */
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
376 
377 /*MC
378    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
379 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
381    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
382    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
383 
384    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
385    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
386    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
387    for communicators controlling multiple processes.  It is recommended that you call both of
388    the above preallocation routines for simplicity.
389 
390    Options Database Keys:
391 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
392 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
393 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
394 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
395 
396   Level: beginner
397 
 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
401 
402 // get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
403 PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
404 {
405 #if defined(PETSC_USE_CTABLE)
406   SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
407 #else
408   PetscSplitCSRDataStructure **p_d_mat;
409   PetscMPIInt                size,rank;
410   MPI_Comm                   comm;
411   PetscErrorCode             ierr;
412   int                        *ai,*bi,*aj,*bj;
413   PetscScalar                *aa,*ba;
414 
415   PetscFunctionBegin;
416   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
417   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
418   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
419   if (A->factortype == MAT_FACTOR_NONE) {
420     CsrMatrix *matrixA,*matrixB=NULL;
421     if (size == 1) {
422       Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
423       p_d_mat = &cusparsestruct->deviceMat;
424       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
425       if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
426 	matrixA = (CsrMatrix*)matstruct->mat;
427 	bi = bj = NULL; ba = NULL;
428       } else {
429 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
430       }
431     } else {
432       Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
433       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
434       p_d_mat = &spptr->deviceMat;
435       Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
436       Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
437       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
438       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
439       if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
440 	if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
441 	matrixA = (CsrMatrix*)matstructA->mat;
442 	matrixB = (CsrMatrix*)matstructB->mat;
443 	bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
444 	bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
445 	ba = thrust::raw_pointer_cast(matrixB->values->data());
446 	if (rank==-1) {
447 	  for(unsigned int i = 0; i < matrixB->row_offsets->size(); i++)
448 	    std::cout << "\trow_offsets[" << i << "] = " << (*matrixB->row_offsets)[i] << std::endl;
449 	  for(unsigned int i = 0; i < matrixB->column_indices->size(); i++)
450 	    std::cout << "\tcolumn_indices[" << i << "] = " << (*matrixB->column_indices)[i] << std::endl;
451 	  for(unsigned int i = 0; i < matrixB->values->size(); i++)
452 	    std::cout << "\tvalues[" << i << "] = " << (*matrixB->values)[i] << std::endl;
453 	}
454       } else {
455 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
456       }
457     }
458     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
459     aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
460     aa = thrust::raw_pointer_cast(matrixA->values->data());
461   } else {
462     *B = NULL;
463     PetscFunctionReturn(0);
464   }
465   // act like MatSetValues because not called on host
466   if (A->assembled) {
467     if (A->was_assembled) {
468       ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
469     }
470     A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
471   } else {
472     SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
473   }
474   if (!*p_d_mat) {
475     cudaError_t                 err;
476     PetscSplitCSRDataStructure  *d_mat, h_mat;
477     Mat_SeqAIJ                  *jaca;
478     PetscInt                    n = A->rmap->n, nnz;
479     // create and copy
480     ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
481     err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
482     err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
483     *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
484     if (size == 1) {
485       jaca = (Mat_SeqAIJ*)A->data;
486       h_mat.nonzerostate = A->nonzerostate;
487       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
488       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
489       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
490       h_mat.offdiag.a = NULL;
491       h_mat.seq = PETSC_TRUE;
492     } else {
493       Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
494       Mat_SeqAIJ  *jacb;
495       h_mat.seq = PETSC_FALSE; // for MatAssemblyEnd_SeqAIJCUSPARSE
496       jaca = (Mat_SeqAIJ*)aij->A->data;
497       jacb = (Mat_SeqAIJ*)aij->B->data;
498       h_mat.nonzerostate = aij->A->nonzerostate; // just keep one nonzero state?
499       if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
500       if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
501       // create colmap - this is ussually done (lazy) in MatSetValues
502       aij->donotstash = PETSC_TRUE;
503       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
504       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly
505       ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
506       aij->colmap[A->cmap->N] = -9;
507       ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
508       {
509 	PetscInt ii;
510 	for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
511       }
512       if(aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
513       // allocate B copy data
514       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
515       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
516       nnz = jacb->i[n];
517 
518       if (jacb->compressedrow.use) {
519 	err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
520 	err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
521       } else {
522 	h_mat.offdiag.i = bi;
523       }
524       h_mat.offdiag.j = bj;
525       h_mat.offdiag.a = ba;
526 
527       err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
528       err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
529       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
530       h_mat.offdiag.n = n;
531     }
532     // allocate A copy data
533     nnz = jaca->i[n];
534     h_mat.diag.n = n;
535     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
536     ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr);
537     if (jaca->compressedrow.use) {
538       err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
539       err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
540     } else {
541       h_mat.diag.i = ai;
542     }
543     h_mat.diag.j = aj;
544     h_mat.diag.a = aa;
545     // copy pointers and metdata to device
546     err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
547     ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
548   } else {
549     *B = *p_d_mat;
550   }
551   A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
552   PetscFunctionReturn(0);
553 #endif
554 }
555