xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 8a53a0a473cbd886d528d66811e6ee7fc8c83ebf)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_CXX_COMPLEX_FIX
3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
4 
5 #include <petscconf.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
7 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
8 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
9 #include <thrust/advance.h>
10 #include <petscsf.h>
11 
/* Thrust functor: assigns the first tuple component to the second.
   Used below to gather permuted COO values into a work array. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
21 
/* MatSetValuesCOO_MPIAIJCUSPARSE - insert the values v (given in the ordering used at
   MatSetPreallocationCOO time) into the diagonal and off-diagonal blocks on the GPU.

   If a permutation (coo_p) was built during preallocation, v is first gathered through
   that permutation into the work array coo_pw; otherwise v is already split as
   [diagonal values | off-diagonal values] and is passed straight through. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscInt           nv      = cusp->coo_nd + cusp->coo_no; /* total number of COO entries */
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> vals_d;
    THRUSTARRAY                           *stage = NULL; /* device copy of v when v is host memory */

    if (isCudaMem(v)) {
      vals_d = thrust::device_pointer_cast(v);
    } else {
      stage = new THRUSTARRAY(nv);
      stage->assign(v,v+nv);
      ierr = PetscLogCpuToGpu(nv*sizeof(PetscScalar));CHKERRQ(ierr);
      vals_d = stage->data();
    }

    /* gather v[coo_p[k]] -> coo_pw[k] */
    auto first = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(vals_d,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto last  = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(vals_d,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(first,last,VecCUDAEquals());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete stage;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(mpiaij->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(mpiaij->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    /* no permutation needed: the first coo_nd entries belong to the diagonal block */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(mpiaij->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(mpiaij->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
66 
/* Predicate on (row, col, perm) tuples: true when the column index (component 1)
   lies inside the local diagonal block [_cstart,_cend). Tuple counterpart of
   the negation of IsOffDiag. */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return col >= _cstart && col < _cend;
  }
};
79 
/* Predicate: true when global column c falls outside the local diagonal block
   [_cstart,_cend), i.e. belongs to the off-diagonal part. */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return !(_cstart <= c && c < _cend);
  }
};
91 
/* Shift a global index down by the start of the owned range, producing a local index. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};
103 
/* MatSetPreallocationCOO_MPIAIJCUSPARSE - build the diagonal (A) and off-diagonal (B)
   sequential CUSPARSE blocks from n global COO entries (coo_i[], coo_j[]).

   Entries are partitioned on the device into [diagonal | off-diagonal]; when the input
   is not already in that order a permutation (coo_p) plus a matching scalar work array
   (coo_pw) are kept so MatSetValuesCOO can reorder values later. Off-diagonal column
   indices are compacted, and the local-to-global column map is stored in garray. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *mpiaij = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscErrorCode         ierr;
  PetscInt               *offcols;  /* host copy of the off-diagonal column indices */
  size_t                 noff = 0;  /* number of distinct off-diagonal columns */
  THRUSTINTARRAY         rows_d(n);
  THRUSTINTARRAY         cols_d(n);
  ISLocalToGlobalMapping g2l;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* discard any previous blocks: this call defines a brand new pattern */
  if (mpiaij->A) { ierr = MatCUSPARSEClearHandle(mpiaij->A);CHKERRQ(ierr); }
  if (mpiaij->B) { ierr = MatCUSPARSEClearHandle(mpiaij->B);CHKERRQ(ierr); }
  ierr = PetscFree(mpiaij->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&mpiaij->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&mpiaij->A);CHKERRQ(ierr);
  ierr = MatDestroy(&mpiaij->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  rows_d.assign(coo_i,coo_i+n);
  cols_d.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,cols_d.begin(),cols_d.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,cols_d.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != cols_d.end() && firstdiag != cols_d.end()) {
    /* entries are interleaved: partition them as [diag | offdiag] and remember the permutation */
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(rows_d.begin(),cols_d.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(rows_d.end(),cols_d.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(cols_d.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,cols_d.end());

  /* from global to local */
  thrust::transform(thrust::device,rows_d.begin(),rows_d.end(),rows_d.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,cols_d.begin(),firstoffd,cols_d.begin(),GlobToLoc(B->cmap->rstart));
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&offcols);CHKERRQ(ierr);
  cerr = cudaMemcpy(offcols,cols_d.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = cols_d.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd);
  /* sort+unique compacts the off-diagonal columns of the device array in place */
  thrust::sort(thrust::device,o_j,cols_d.end());
  auto wit = thrust::unique(thrust::device,o_j,cols_d.end());
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  /* garray maps the noff compacted off-diagonal columns back to global indices */
  ierr = PetscMalloc1(noff+1,&mpiaij->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(mpiaij->garray,cols_d.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,mpiaij->garray,PETSC_COPY_VALUES,&g2l);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(g2l,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  /* translate the host copy of the off-diagonal columns to compacted local indices */
  ierr = ISGlobalToLocalMappingApply(g2l,IS_GTOLM_DROP,cusp->coo_no,offcols,&n,offcols);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&g2l);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
  ierr = MatSetSizes(mpiaij->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(mpiaij->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpiaij->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
  ierr = MatSetSizes(mpiaij->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(mpiaij->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpiaij->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(mpiaij->A,cusp->coo_nd,rows_d.data().get(),cols_d.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(mpiaij->B,cusp->coo_no,rows_d.data().get()+cusp->coo_nd,offcols);CHKERRQ(ierr);
  ierr = PetscFree(offcols);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(mpiaij->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(mpiaij->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpiaij->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpiaij->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(mpiaij->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpiaij->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(mpiaij->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(mpiaij->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
207 
/* MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE - merge the diagonal and off-diagonal
   blocks into a single sequential CUSPARSE matrix A_loc; optionally return in glob
   the global column index of every local column of the merged matrix
   (owned columns first, then the ghost columns from cmap). */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cstart,k,nd,no,*gindices;

    ierr = MatGetLocalSize(Ad,NULL,&nd);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&no);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cstart,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(nd+no,&gindices);CHKERRQ(ierr);
    for (k=0; k<nd; k++) gindices[k]    = cstart + k; /* owned columns are contiguous */
    for (k=0; k<no; k++) gindices[k+nd] = cmap[k];    /* ghost columns come from the column map */
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),nd+no,gindices,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
230 
/* MatMPIAIJSetPreallocation_MPIAIJCUSPARSE - classic row-count preallocation for the
   diagonal (d_nz/d_nnz) and off-diagonal (o_nz/o_nnz) CUSPARSE blocks. The off-diagonal
   block is recreated from scratch because its size may have changed. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *aij  = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  PetscErrorCode     ierr;
  PetscInt           row;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* in debug builds, validate user-provided per-row counts */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (d_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",row,d_nnz[row]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (o_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",row,o_nnz[row]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(aij->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
  if (!aij->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
    ierr = MatSetSizes(aij->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)aij->A);CHKERRQ(ierr);
  }
  if (!aij->B) {
    PetscMPIInt nranks;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&nranks);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
    /* sequential runs have no off-process columns, so the off-diagonal block is empty */
    ierr = MatSetSizes(aij->B,B->rmap->n,nranks > 1 ? B->cmap->N : 0,B->rmap->n,nranks > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)aij->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(aij->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(aij->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(aij->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(aij->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(aij->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(aij->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(aij->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(aij->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(aij->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(aij->B,cusp->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(aij->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(aij->B,cusp->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
290 
/* y = A*x: start the ghost-value scatter, multiply by the diagonal block while the
   communication is in flight, then accumulate the off-diagonal contribution. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *aij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nx;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nx);CHKERRQ(ierr);
  if (nx != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nx);
  ierr = VecScatterBegin(aij->Mvctx,xx,aij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*aij->A->ops->mult)(aij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(aij->Mvctx,xx,aij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*aij->B->ops->multadd)(aij->B,aij->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
306 
/* MatZeroEntries_MPIAIJCUSPARSE - zero all stored values while keeping the nonzero
   pattern. If a split-CSR device mirror exists (spptr->deviceMat), its diagonal and
   off-diagonal value arrays are cleared directly on the GPU as well. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n]; /* nnz counts from the host CSR row pointers */
      cudaError_t  err;
      PetscScalar  *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr); /* fixed typo: was "offfdiag" */
      /* fetch the device-side value pointers, then clear the arrays in place on the GPU */
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
/* z = y + A*x: same communication/computation overlap as MatMult, accumulating into z. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *aij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nx;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nx);CHKERRQ(ierr);
  if (nx != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nx);
  ierr = VecScatterBegin(aij->Mvctx,xx,aij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*aij->A->ops->multadd)(aij->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(aij->Mvctx,xx,aij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*aij->B->ops->multadd)(aij->B,aij->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
349 
/* y = A^T x: both block transposes are applied locally, then the off-diagonal
   contribution in lvec is scattered back (ADD, reverse) to the owning ranks. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *aij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nx;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nx);CHKERRQ(ierr);
  if (nx != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nx);
  ierr = (*aij->B->ops->multtranspose)(aij->B,xx,aij->lvec);CHKERRQ(ierr);
  ierr = (*aij->A->ops->multtranspose)(aij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(aij->Mvctx,aij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(aij->Mvctx,aij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
365 
/* MatCUSPARSESetFormat_MPIAIJCUSPARSE - record the requested cuSPARSE storage format
   for the diagonal and/or off-diagonal block; it is applied when the blocks are built. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed error message: it previously listed MAT_CUSPARSE_MULT_DIAG twice and the
       nonexistent name MAT_CUSPARSE_MULT_ALL instead of the handled enumerators */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
388 
/* MatSetFromOptions_MPIAIJCUSPARSE - process the -mat_cusparse_* options controlling
   the storage format of the diagonal/off-diagonal blocks (non-factored matrices only). */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                set;
  Mat_MPIAIJ               *aij  = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusp = (Mat_MPIAIJCUSPARSE*)aij->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&format,&set);CHKERRQ(ierr);
    if (set) { ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); }
    /* off-diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->offdiagGPUMatFormat,(PetscEnum*)&format,&set);CHKERRQ(ierr);
    if (set) { ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); }
    /* both blocks at once */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&format,&set);CHKERRQ(ierr);
    if (set) { ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
419 
/* MatAssemblyEnd_MPIAIJCUSPARSE - finish MPIAIJ assembly, move the ghost vector to
   the GPU on first assembly (optionally in NVSHMEM memory), and flag GPU residency
   when a device-side split-CSR matrix exists. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *devmat = cusp->deviceMat;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
   #if defined(PETSC_HAVE_NVSHMEM)
    {
      PetscMPIInt result;
      PetscBool   useNvshmem = PETSC_FALSE;
      ierr = PetscOptionsGetBool(NULL,NULL,"-use_nvshmem",&useNvshmem,NULL);CHKERRQ(ierr);
      if (useNvshmem) {
        /* NVSHMEM allocation is attempted only when A's communicator matches PETSC_COMM_WORLD */
        ierr = MPI_Comm_compare(PETSC_COMM_WORLD,PetscObjectComm((PetscObject)A),&result);CHKERRMPI(ierr);
        if (result == MPI_IDENT || result == MPI_CONGRUENT) {ierr = VecAllocateNVSHMEM_SeqCUDA(mpiaij->lvec);CHKERRQ(ierr);}
      }
    }
   #endif
  }
  if (devmat) A->offloadmask = PETSC_OFFLOAD_GPU; /* if we assembled on the device */
  PetscFunctionReturn(0);
}
447 
/* MatDestroy_MPIAIJCUSPARSE - free device-side resources (split-CSR mirror, cusparse
   handle, COO permutation/work arrays), clear the composed methods, then delegate to
   the generic MPIAIJ destroy. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij  = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        cerr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusp->deviceMat, h_mat;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* pull the device struct back to the host so we can free the buffers it points to */
    cerr = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    /* row index arrays are freed only in the compressed-row case, mirroring the allocation path */
    if (jaca->compressedrow.use) { cerr = cudaFree(h_mat.diag.i);CHKERRCUDA(cerr); }
    if (jacb->compressedrow.use) { cerr = cudaFree(h_mat.offdiag.i);CHKERRCUDA(cerr); }
    cerr = cudaFree(h_mat.colmap);CHKERRCUDA(cerr);
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    delete cusp->coo_p;
    delete cusp->coo_pw;
    delete cusp;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
496 
/* MatConvert_MPIAIJ_MPIAIJCUSPARSE - turn an MPIAIJ matrix into MPIAIJCUSPARSE:
   duplicate/copy as requested, switch the blocks and ghost vector to their CUDA types,
   allocate the cusparse-specific context once, and install the CUSPARSE operations. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij;
  Mat_MPIAIJCUSPARSE *cusp;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* for MAT_INPLACE_MATRIX, *newmat is B itself */
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  mpiaij = (Mat_MPIAIJ*)A->data;
  if (mpiaij->A) { ierr = MatSetType(mpiaij->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (mpiaij->B) { ierr = MatSetType(mpiaij->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }

  if (reuse != MAT_REUSE_MATRIX && !mpiaij->spptr) {
    mpiaij->spptr = new Mat_MPIAIJCUSPARSE;

    cusp                      = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
    cusp->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusp->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusp->coo_p               = NULL;
    cusp->coo_pw              = NULL;
    cusp->stream              = 0; /* We should not need cusparseStruct->stream */
    stat = cusparseCreate(&(cusp->handle));CHKERRCUSPARSE(stat);
    stat = cusparseSetStream(cusp->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
    cusp->deviceMat = NULL;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
554 
/* MatCreate_MPIAIJCUSPARSE - constructor registered for MATMPIAIJCUSPARSE: create a
   plain MPIAIJ matrix and convert it in place. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* make sure CUDA is initialized */
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
565 
566 /*@
567    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
568    (the default parallel PETSc format).  This matrix will ultimately be pushed down
569    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
570    assembly performance the user should preallocate the matrix storage by setting
571    the parameter nz (or the array nnz).  By setting these parameters accurately,
572    performance during matrix assembly can be increased by more than a factor of 50.
573 
574    Collective
575 
576    Input Parameters:
577 +  comm - MPI communicator
578 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
579 .  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
580 .  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
581 .  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
582 .  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
583 .  d_nnz - array containing the number of nonzeros per row in the diagonal block
584          (possibly different for each row) or NULL
585 .  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
586 -  o_nnz - array containing the number of nonzeros per row in the off-diagonal block
587          (possibly different for each row) or NULL
583 
584    Output Parameter:
585 .  A - the matrix
586 
587    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
588    MatXXXXSetPreallocation() paradigm instead of this routine directly.
589    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
590 
591    Notes:
592    If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
593 
594    The AIJ format (also called the Yale sparse matrix format or
595    compressed row storage), is fully compatible with standard Fortran 77
596    storage.  That is, the stored row and column indices can begin at
597    either one (as in Fortran) or zero.  See the users' manual for details.
598 
599    Specify the preallocated storage with either nz or nnz (not both).
600    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
601    allocation.  For large problems you MUST preallocate memory or you
602    will get TERRIBLE performance, see the users' manual chapter on matrices.
603 
604    By default, this format uses inodes (identical nodes) when possible, to
605    improve numerical efficiency of matrix-vector products and solves. We
606    search for consecutive rows with the same nonzero structure, thereby
607    reusing matrix information to achieve increased efficiency.
608 
609    Level: intermediate
610 
611 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
612 @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    nranks;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&nranks);CHKERRMPI(ierr);
  /* single-process communicators get the sequential type directly */
  if (nranks > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
631 
632 /*MC
633    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
634 
635    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
636    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
637    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
638 
639    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
640    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
641    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
642    for communicators controlling multiple processes.  It is recommended that you call both of
643    the above preallocation routines for simplicity.
644 
645    Options Database Keys:
646 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
647 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
648 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
649 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
650 
651   Level: beginner
652 
 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
656 
/* MatCUSPARSEGetDeviceMatWrite - return a device (GPU) pointer to a stripped-down
   split-CSR descriptor of A so that device kernels can write matrix values.
   Handles both the sequential (size==1) and MPI AIJCUSPARSE cases.

   Input:
     A - an assembled, non-factored AIJCUSPARSE matrix; the diagonal (and, in
         parallel, off-diagonal) blocks must be stored in MAT_CUSPARSE_CSR format
   Output:
     B - device pointer to a PetscSplitCSRDataStructure mirroring A's CSR data
         (set to NULL and returns early if A is factored)

   Side effects: sets A->assembled = PETSC_FALSE so the matrix accepts new values;
   on the first call for an MPI matrix it also builds aij->colmap and disables
   stashing / off-process entries. The device descriptor is built once and cached
   in A's GPU struct (deviceMat); later calls just return the cached pointer. */
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;           // address of the cached device descriptor inside A's GPU struct
  PetscMPIInt                size,rank;           // NOTE(review): rank is obtained below but never read (h_mat.rank is queried separately)
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;     // raw device CSR row-offset/column arrays: a* = diagonal block, b* = off-diagonal block
  PetscScalar                *aa,*ba;             // raw device CSR value arrays: diagonal / off-diagonal

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      // Sequential: only a diagonal block exists; the off-diagonal pointers stay NULL
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      // MPI: extract device CSR pointers for both the diagonal (aij->A) and off-diagonal (aij->B) blocks
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    // device pointers into the cusparse CSR arrays of the diagonal block (valid in both branches)
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    // Factored matrices have no writable CSR view; signal that with a NULL output
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
  }
  if (!*p_d_mat) {
    // First call: build the device-side descriptor, cache it, and return it
    cudaError_t                 err;
    PetscSplitCSRDataStructure  *d_mat, h_mat;   // d_mat lives on the device; h_mat is the host staging copy
    Mat_SeqAIJ                  *jaca;
    PetscInt                    n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      // Sequential: whole matrix is the "diagonal" block; ownership range is [0,n)x[0,ncols)
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ  *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazy) in MatSetValues
      // NOTE(review): PetscCalloc1 below does not check for an already-existing
      // aij->colmap — presumably this path never runs twice; confirm no leak.
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9;  // sentinel past the last global column; checked after the fill loop to catch out-of-range garray entries
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        // colmap maps global column -> local off-diagonal column + 1 (0 means "not present")
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];  // NOTE(review): dead store — nnz is overwritten with jaca->i[n] below before any use

      if (jacb->compressedrow.use) {
        // compressed-row storage: the cusparse row_offsets (bi) do not hold full
        // (n+1)-length offsets, so upload a private copy of the host array jacb->i
        err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    if (jaca->compressedrow.use) {
      // same as the off-diagonal case: need full host row offsets on the device
      err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    // Descriptor already built on a previous call: hand back the cached device pointer
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}
792