xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 2e5835c6908701226356db06d805ef70170dbd55) !
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_CXX_COMPLEX_FIX
3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
4 
5 #include <petscconf.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
7 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
8 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
9 #include <thrust/advance.h>
10 
/* Thrust functor: assigns the first element of a zip-iterator tuple to the
   second, i.e. dst = src.  Used in MatSetValuesCOO to gather permuted COO
   values into the device work array. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
20 
/* Set the numerical values of a matrix previously preallocated with
   MatSetPreallocationCOO_MPIAIJCUSPARSE().  v holds one scalar per (i,j)
   pair passed at preallocation time and may reside in host or device memory.
   If a permutation (cusp->coo_p) was produced at preallocation (diagonal and
   off-diagonal entries were interleaved in the input), the values are first
   permuted into the device work array cusp->coo_pw before being handed to
   the sequential diagonal (a->A) and off-diagonal (a->B) blocks. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no;  /* total number of COO entries */
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* host values: stage them through a temporary device array */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* coo_pw[k] = v[coo_p[k]]: apply the diag/offdiag permutation on the device */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    /* first coo_nd permuted values belong to the diagonal block, the rest to the off-diagonal block */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    /* no permutation needed: values are already split as [diag | offdiag] */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  /* mark the matrix assembled with up-to-date copies on both host and device */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
67 
/* Thrust predicate over (row, col, perm) tuples: true when the column index
   (tuple slot 1) lies inside the half-open local column-ownership range
   [_cstart,_cend), i.e. the entry belongs to the diagonal block. */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return (col >= _cstart) && (col < _cend);
  }
};
80 
/* Thrust predicate: true when a column index falls outside the half-open
   local column-ownership range [_cstart,_cend), i.e. the entry belongs to
   the off-diagonal block. */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    const bool owned = (_cstart <= c) && (c < _cend);
    return !owned;
  }
};
92 
/* Thrust unary op: converts a global index to a local one by subtracting
   the first index owned by this process. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};
104 
/* Preallocate an MPIAIJCUSPARSE matrix from n COO entries (coo_i[k],coo_j[k])
   given in global numbering.  On the device, entries are split into the local
   diagonal block (columns in [cmap->rstart,cmap->rend)) and the off-diagonal
   block; b->A and b->B are (re)created as SEQAIJCUSPARSE matrices and
   preallocated with the respective pieces.  When diagonal and off-diagonal
   entries are interleaved in the input, a permutation (cusp->coo_p) and a
   value work array (cusp->coo_pw) are kept so MatSetValuesCOO can reorder
   values later.  NOTE: the input parameter n is reused as scratch below
   (overwritten by ISGlobalToLocalMappingApply). */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;    /* host copy of off-diagonal column indices */
  size_t                 noff = 0; /* number of distinct off-diagonal (ghost) columns */
  THRUSTINTARRAY         d_i(n); /* device copy of row indices */
  THRUSTINTARRAY         d_j(n); /* device copy of column indices */
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* discard any previous diagonal/off-diagonal blocks and supporting data */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* if all diagonal entries already precede all off-diagonal entries, no
     permutation is needed; otherwise stably partition (i,j,perm) tuples so
     the diagonal block comes first and remember the permutation */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # off-diagonal entries  */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* sort+unique the off-diagonal columns on device to build the ghost list (garray) */
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* translate the saved off-diagonal global columns in jj to local (compact) numbering */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* (re)create the sequential diagonal block (n x n local columns) ... */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  /* ... and the off-diagonal block with one column per ghost column */
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  /* propagate formats/handles/streams from the MPI wrapper to the blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  /* values have not been set yet: matrix is preallocated but not assembled */
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
208 
/* Merge the diagonal and off-diagonal blocks of A into one sequential matrix
   A_loc; if glob is non-NULL, also return the global column indices of the
   merged matrix (owned columns first, then the ghost columns from cmap). */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cstart, j, ndiag, noffd, *indices;

    ierr = MatGetLocalSize(Ad,NULL,&ndiag);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&noffd);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cstart,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(ndiag+noffd,&indices);CHKERRQ(ierr);
    /* owned columns are contiguous starting at cstart; ghosts come from the column map */
    for (j=0; j<ndiag+noffd; j++) indices[j] = (j < ndiag) ? cstart + j : cmap[j-ndiag];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),ndiag+noffd,indices,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
231 
/* Classic (nz/nnz based) preallocation for an MPIAIJCUSPARSE matrix: creates
   (on first call) the sequential diagonal block b->A and off-diagonal block
   b->B as SEQAIJCUSPARSE matrices, preallocates them, and propagates the
   cuSPARSE format/handle/stream from the MPI wrapper.  In debug builds the
   nnz arrays are validated first. */
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* debug-only sanity checks: per-row nonzero counts must be non-negative */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    /* off-diagonal block starts with the full global column width; it is
       compacted later by MatSetUpMultiply_MPIAIJ */
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  if (b->lvec) {
    ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* share the MPI wrapper's cuSPARSE configuration with both blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
280 
/* yy = A*xx for an MPIAIJCUSPARSE matrix: start the ghost-value scatter,
   overlap it with the diagonal-block product, then finish the scatter and
   accumulate the off-diagonal-block contribution into yy. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nx;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nx);CHKERRQ(ierr);
  if (nx != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nx);
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
296 
/* Zero all entries of an MPIAIJCUSPARSE matrix.  If a stripped-down device
   matrix (spptr->deviceMat) exists, its diagonal and off-diagonal value
   arrays are zeroed directly on the GPU (the array lengths come from the
   host-side CSR row pointers), then the regular host-side blocks are zeroed.
   Fix: corrected "offfdiag" typo in the PetscInfo message. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n]; /* nnz counts from CSR row pointers */
      cudaError_t  err;
      PetscScalar  *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      /* fetch each device-side value-array pointer, then zero it in place */
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
/* zz = yy + A*xx: overlap the ghost-value scatter with the diagonal-block
   multiply-add, then fold in the off-diagonal-block contribution. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nx;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nx);CHKERRQ(ierr);
  if (nx != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nx);
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
339 
/* yy = A^T * xx.  The off-diagonal transpose product goes into the local
   work vector first; it is then scattered back (reverse scatter with
   ADD_VALUES) on top of the diagonal-block transpose product. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
355 
/* Select the cuSPARSE storage format used for SpMV for the diagonal block,
   the off-diagonal block, or both.  Reached through the composed
   MatCUSPARSESetFormat_C function.
   Fix: the unsupported-op error message previously listed
   MAT_CUSPARSE_MULT_DIAG twice and named a nonexistent MAT_CUSPARSE_MULT_ALL;
   it now names the operations this switch actually handles. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    /* set both blocks at once */
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
378 
/* Process -mat_cusparse_* options controlling the SpMV storage format of the
   diagonal and off-diagonal blocks.  Only meaningful for unfactored matrices. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    /* off-diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    /* both blocks at once; processed last so it overrides the per-block options */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
409 
/* Finish assembly via the generic MPIAIJ path, then make sure the local work
   vector is a CUDA vector on first final assembly.  If a stripped-down device
   matrix exists, the values were set on the device, so the offload mask is
   flipped to GPU. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    /* lvec is created by MatAssemblyEnd_MPIAIJ; retype it for the GPU */
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}
427 
/* Destroy an MPIAIJCUSPARSE matrix: free the stripped-down device matrix (if
   one was created), release per-block cuSPARSE handles, destroy the shared
   cuSPARSE handle/stream and COO scratch arrays, unregister composed
   functions, and finally fall through to the generic MPIAIJ destroy. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* pull the device struct back to the host so its embedded device pointers can be freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* row-pointer arrays were only device-allocated for compressed-row blocks */
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* stream is only destroyed if one was explicitly created (0 = default stream) */
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* remove the composed functions registered in MatConvert_MPIAIJ_MPIAIJCUSPARSE */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
474 
/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE: retype the sequential blocks
   and the local work vector for the GPU, allocate (if needed) the
   Mat_MPIAIJCUSPARSE side structure with its cuSPARSE handle, install the
   CUDA-specific method table, and register the composed functions. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  /* MAT_INPLACE_MATRIX arrives with *newmat == B, so only the other two
     reuse modes need an explicit duplicate/copy */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;

  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  /* retype the existing sequential blocks and work vector for the GPU */
  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    /* defaults: CSR storage for both blocks, default stream, fresh handle */
    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
530 
/* Type constructor for MATMPIAIJCUSPARSE: verify CUDA is initialized, build a
   plain MPIAIJ matrix, then convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
541 
542 /*@
543    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
545    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
546    assembly performance the user should preallocate the matrix storage by setting
547    the parameter nz (or the array nnz).  By setting these parameters accurately,
548    performance during matrix assembly can be increased by more than a factor of 50.
549 
550    Collective
551 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
           (possibly different for each row) or NULL
559 
560    Output Parameter:
561 .  A - the matrix
562 
563    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
564    MatXXXXSetPreallocation() paradigm instead of this routine directly.
565    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
566 
567    Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
569 
570    The AIJ format (also called the Yale sparse matrix format or
571    compressed row storage), is fully compatible with standard Fortran 77
572    storage.  That is, the stored row and column indices can begin at
573    either one (as in Fortran) or zero.  See the users' manual for details.
574 
   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT, o_nz=PETSC_DEFAULT, d_nnz=NULL, and o_nnz=NULL for PETSc to
   control dynamic memory allocation.  For large problems you MUST preallocate memory
   or you will get TERRIBLE performance, see the users' manual chapter on matrices.
579 
580    By default, this format uses inodes (identical nodes) when possible, to
581    improve numerical efficiency of matrix-vector products and solves. We
582    search for consecutive rows with the same nonzero structure, thereby
583    reusing matrix information to achieve increased efficiency.
584 
585    Level: intermediate
586 
587 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
588 @*/
/* Convenience constructor documented in the manual-page comment above:
   picks MATMPIAIJCUSPARSE or MATSEQAIJCUSPARSE based on the communicator
   size and forwards the preallocation parameters accordingly (the o_* pair
   is only meaningful in the parallel case). */
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
607 
608 /*MC
609    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
610 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
612    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
613    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
614 
615    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
616    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
617    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
618    for communicators controlling multiple processes.  It is recommended that you call both of
619    the above preallocation routines for simplicity.
620 
621    Options Database Keys:
622 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
623 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
624 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
625 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
626 
627   Level: beginner
628 
629  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
632 
// Get a device (GPU) pointer to a stripped-down split-CSR view of an AIJCUSPARSE matrix,
// creating and caching it on first use. Works for both sequential and MPI matrices.
//
// Input Parameter:
//   A - the matrix; must be assembled, unfactored, and stored in MAT_CUSPARSE_CSR format
// Output Parameter:
//   B - device pointer to the PetscSplitCSRDataStructure; NULL if A is factored
//
// Side effects: marks A as unassembled so device-side value insertion can proceed (a host
// assembly is expected afterwards); on MPI matrices it also builds aij->colmap and disables
// stashing of off-process entries.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;       // where the cached device matrix lives inside A's GPU-specific data
  PetscMPIInt                size;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj; // device CSR row offsets/column indices: a* = diag block, b* = offdiag block
  PetscScalar                *aa,*ba;         // device CSR values

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      // sequential: the whole matrix is the "diagonal" block; there is no off-diagonal part
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      // MPI: collect device CSR pointers for both the diagonal (aij->A) and off-diagonal (aij->B) blocks
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL; // factored matrices carry no device metadata
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazily) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
  }
  if (!*p_d_mat) {
    // first call: build the host-side mirror h_mat and copy it to a freshly allocated device struct
    cudaError_t                 err;
    PetscSplitCSRDataStructure  *d_mat, h_mat;
    Mat_SeqAIJ                  *jaca;
    PetscInt                    n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    // zero the host mirror so fields not set on the sequential path (colmap, offdiag.n, ...)
    // hold deterministic NULL/0 values when the whole struct is copied to the device below
    ierr = PetscMemzero(&h_mat,sizeof(h_mat));CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ  *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazily) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9; // sentinel: detect out-of-range garray entries clobbering the last slot
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;

      if (jacb->compressedrow.use) {
        // device CsrMatrix row offsets are compressed here, so upload the full host row offsets instead
        // NOTE(review): copies PetscInt host offsets into an int device buffer; assumes 32-bit PetscInt - confirm for 64-bit-index builds
        err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    if (jaca->compressedrow.use) {
      // same as the off-diagonal block: upload the full (uncompressed) row offsets for the kernel
      // NOTE(review): same PetscInt-vs-int size assumption as above
      err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat; // already built: hand back the cached device pointer
  }
  A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}
768