#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <petscsf.h>

struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
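
/* VecCUDAEquals drives the thrust gathers below: zipped with a permutation
   iterator it copies v[p[k]] into w[k] on the device. A minimal sketch of the
   pattern, with hypothetical device vectors v, p, and w:

     auto first = thrust::make_zip_iterator(thrust::make_tuple(
                    thrust::make_permutation_iterator(v.begin(),p.begin()),w.begin()));
     auto last  = thrust::make_zip_iterator(thrust::make_tuple(
                    thrust::make_permutation_iterator(v.begin(),p.end()),w.end()));
     thrust::for_each(first,last,VecCUDAEquals());
*/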

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
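
/* Hedged usage sketch of the COO assembly path backed by the routine above,
   assuming the public wrappers MatSetPreallocationCOO()/MatSetValuesCOO()
   dispatch here via the composed "MatSetPreallocationCOO_C"/"MatSetValuesCOO_C"
   functions (the values array may live on host or device, as handled above):

     PetscInt    i[] = {0,0,1};        // global row indices
     PetscInt    j[] = {0,5,1};        // global column indices
     PetscScalar v[] = {1.0,2.0,3.0};  // values, one per (i,j) pair
     ierr = MatSetPreallocationCOO(A,3,i,j);CHKERRQ(ierr);     // analyze the pattern once
     ierr = MatSetValuesCOO(A,v,INSERT_VALUES);CHKERRQ(ierr);  // repeatable value updates
*/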

template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};
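
/* IsOffDiag/IsNotOffDiagT test whether a global column index falls outside the
   local ownership range [cstart,cend); GlobToLoc shifts global indices to local
   numbering. A minimal sketch of the transform, with a hypothetical device
   vector cols holding global column indices:

     thrust::transform(thrust::device,cols.begin(),cols.end(),cols.begin(),GlobToLoc(cstart));
*/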

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n);
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy off-diagonal column indices to the CPU to build the local-to-global map */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected IS size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory; the cusparse-specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
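
/* A small worked example of the split above (numbers illustrative). Suppose this
   rank owns columns [cstart,cend) = [4,8) and receives COO entries with global
   columns j = {5,1,6,9}. IsOffDiag marks {1,9}, the partition reorders the
   entries into the diagonal group {5,6} followed by the off-diagonal group
   {1,9}, coo_p records the permutation so MatSetValuesCOO() can reshuffle the
   values the same way, and coo_nd = 2, coo_no = 2. */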

static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because b->B may have been resized, we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
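
/* Hedged usage sketch of the preallocation implemented above; the numbers are
   illustrative. At most 5 nonzeros per local row in the diagonal block and 2 in
   the off-diagonal block:

     ierr = MatSetType(B,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatMPIAIJSetPreallocation(B,5,NULL,2,NULL);CHKERRQ(ierr);
*/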

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
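
/* The multiply above follows the standard MPIAIJ overlap pattern: start the halo
   scatter, multiply the diagonal block against the local entries of x while
   messages are in flight, finish the scatter, then accumulate the off-diagonal
   block applied to the gathered ghost values. With Ad the diagonal block and Ao
   the off-diagonal block:

     y = Ad*x_local + Ao*x_ghost
*/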

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t  err;
      PetscScalar  *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      err = cudaMemcpy(&vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset(vals, 0, nnza*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy(&vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset(vals, 0, nnzb*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
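
/* Hedged usage sketch: select the ELL storage format for both the diagonal and
   off-diagonal blocks of a matrix A of this type:

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_ELL);CHKERRQ(ierr);
*/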

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
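
/* Hedged command-line sketch of the options registered above, assuming a
   hypothetical executable ./app that calls MatSetFromOptions() on the matrix:

     ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format ell
     ./app -mat_type mpiaijcusparse -mat_cusparse_mult_offdiag_storage_format hyb
*/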

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  #if defined(PETSC_HAVE_NVSHMEM)
    {
      PetscMPIInt result;
      PetscBool   useNvshmem = PETSC_FALSE;
      ierr = PetscOptionsGetBool(NULL,NULL,"-use_nvshmem",&useNvshmem,NULL);CHKERRQ(ierr);
      if (useNvshmem) {
        ierr = MPI_Comm_compare(PETSC_COMM_WORLD,PetscObjectComm((PetscObject)A),&result);CHKERRMPI(ierr);
        if (result == MPI_IDENT || result == MPI_CONGRUENT) {ierr = VecAllocateNVSHMEM_SeqCUDA(mpiaij->lvec);CHKERRQ(ierr);}
      }
    }
  #endif
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err = cudaMemcpy(&h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0; /* We should not need cusparseStruct->stream */
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    stat = cusparseSetStream(cusparseStruct->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
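
/* Hedged usage sketch: MatConvert() is the public entry point that reaches the
   conversion routine above:

     Mat C;
     ierr = MatConvert(B,MATMPIAIJCUSPARSE,MAT_INITIAL_MATRIX,&C);CHKERRQ(ierr);
*/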

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). The matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters
   accurately, performance during matrix assembly can be increased by more than a
   factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE)
.  n - number of local columns (or PETSC_DECIDE)
.  M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic
   memory allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
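
/* Hedged usage sketch of the convenience constructor above; sizes and per-row
   nonzero estimates are illustrative:

     Mat A;
     ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,&A);CHKERRQ(ierr);
*/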

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

// Get a GPU pointer to the stripped-down device Mat; works for both Seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // mimic the MatSetValues state handling, since this path is not driven from the host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assembled more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // done lazily in MatAssemble, which we no longer call; also done in AIJ AssemblyEnd. Needed here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Matrix must be assembled");
  }
  if (!*p_d_mat) {
    cudaError_t                 err;
    PetscSplitCSRDataStructure  *d_mat, h_mat;
    Mat_SeqAIJ                  *jaca;
    PetscInt                    n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ  *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done lazily in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9;
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with MatSetValues; done lazily in the normal MatSetValues path
  PetscFunctionReturn(0);
#endif
}
788