xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 70a7d78aacfbd24b2e31399a3d8e056944bb7de3)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
3 
4 #include <petscconf.h>
5 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
6 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
7 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
8 #include <thrust/advance.h>
9 #include <petscsf.h>
10 
11 struct VecCUDAEquals
12 {
13   template <typename Tuple>
14   __host__ __device__
15   void operator()(Tuple t)
16   {
17     thrust::get<1>(t) = thrust::get<0>(t);
18   }
19 };
20 
21 static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
22 {
23   Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
24   Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
25   PetscInt           n = cusp->coo_nd + cusp->coo_no;
26   PetscErrorCode     ierr;
27 
28   PetscFunctionBegin;
29   if (cusp->coo_p && v) {
30     thrust::device_ptr<const PetscScalar> d_v;
31     THRUSTARRAY                           *w = NULL;
32 
33     if (isCudaMem(v)) {
34       d_v = thrust::device_pointer_cast(v);
35     } else {
36       w = new THRUSTARRAY(n);
37       w->assign(v,v+n);
38       ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
39       d_v = w->data();
40     }
41 
42     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
43                                                               cusp->coo_pw->begin()));
44     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
45                                                               cusp->coo_pw->end()));
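    /* The for_each below gathers the user-provided values into the permuted work array;
       conceptually (a host-side sketch, not the code path actually taken) it does

         for (PetscInt k = 0; k < n; k++) (*cusp->coo_pw)[k] = d_v[(*cusp->coo_p)[k]];

       so that diagonal-block entries come first and off-diagonal entries follow, matching
       the split computed in MatSetPreallocationCOO_MPIAIJCUSPARSE(). */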
46     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
47     thrust::for_each(zibit,zieit,VecCUDAEquals());
48     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
49     delete w;
50     ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
51     ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
52   } else {
53     ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
54     ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
55   }
56   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
57   A->num_ass++;
58   A->assembled        = PETSC_TRUE;
59   A->ass_nonzerostate = A->nonzerostate;
60   A->offloadmask      = PETSC_OFFLOAD_GPU;
61   PetscFunctionReturn(0);
62 }
63 
64 template <typename Tuple>
65 struct IsNotOffDiagT
66 {
67   PetscInt _cstart,_cend;
68 
69   IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
70   __host__ __device__
71   inline bool operator()(Tuple t)
72   {
73     return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
74   }
75 };
76 
77 struct IsOffDiag
78 {
79   PetscInt _cstart,_cend;
80 
81   IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
82   __host__ __device__
83   inline bool operator() (const PetscInt &c)
84   {
85     return c < _cstart || c >= _cend;
86   }
87 };
88 
89 struct GlobToLoc
90 {
91   PetscInt _start;
92 
93   GlobToLoc(PetscInt start) : _start(start) {}
94   __host__ __device__
95   inline PetscInt operator() (const PetscInt &c)
96   {
97     return c - _start;
98   }
99 };
100 
101 static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
102 {
103   Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
104   Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
105   PetscErrorCode         ierr;
106   PetscInt               *jj;
107   size_t                 noff = 0;
108   THRUSTINTARRAY         d_i(n);
109   THRUSTINTARRAY         d_j(n);
110   ISLocalToGlobalMapping l2g;
111   cudaError_t            cerr;
112 
113   PetscFunctionBegin;
114   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
115   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
116   if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
117   if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
118   ierr = PetscFree(b->garray);CHKERRQ(ierr);
119   ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
120   ierr = MatDestroy(&b->A);CHKERRQ(ierr);
121   ierr = MatDestroy(&b->B);CHKERRQ(ierr);
122 
123   ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
124   d_i.assign(coo_i,coo_i+n);
125   d_j.assign(coo_j,coo_j+n);
126   delete cusp->coo_p;
127   delete cusp->coo_pw;
128   cusp->coo_p = NULL;
129   cusp->coo_pw = NULL;
130   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
131   auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
132   auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
133   if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
134     cusp->coo_p = new THRUSTINTARRAY(n);
135     cusp->coo_pw = new THRUSTARRAY(n);
136     thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
137     auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
138     auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
139     auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
140     firstoffd = mzipp.get_iterator_tuple().get<1>();
141   }
142   cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
143   cusp->coo_no = thrust::distance(firstoffd,d_j.end());
144 
145   /* from global to local */
146   thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
147   thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
148   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
149 
150   /* copy the off-diagonal column indices to the CPU; they are mapped to local indices below */
151   ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
152   cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
153   auto o_j = d_j.begin();
154   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
155   thrust::advance(o_j,cusp->coo_nd);
156   thrust::sort(thrust::device,o_j,d_j.end());
157   auto wit = thrust::unique(thrust::device,o_j,d_j.end());
158   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
159   noff = thrust::distance(o_j,wit);
160   ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
161   cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
162   ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
163   ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
164   ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
165   ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
166   if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected IS size %D != %D COO size",n,cusp->coo_no);
167   ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);
168 
169   ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
170   ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
171   ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
172   ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
173   ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
174   ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
175   ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
176   ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
177 
178   /* GPU memory: the cuSPARSE-specific preallocation routines handle it internally */
179   ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
180   ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
181   ierr = PetscFree(jj);CHKERRQ(ierr);
182 
183   ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
184   ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
185   ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
186   ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
187   /*
188   ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
189   ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
190   */
191   ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
192   B->preallocated = PETSC_TRUE;
193   B->nonzerostate++;
194 
195   ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
196   ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
197   B->offloadmask = PETSC_OFFLOAD_CPU;
198   B->assembled = PETSC_FALSE;
199   B->was_assembled = PETSC_FALSE;
200   PetscFunctionReturn(0);
201 }
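
/* A minimal user-level sketch of the COO assembly path set up above (the array names and the
   entry count ncoo are placeholders, not part of this file):

     ierr = MatSetPreallocationCOO(A,ncoo,coo_i,coo_j);CHKERRQ(ierr);
     ierr = MatSetValuesCOO(A,coo_v,INSERT_VALUES);CHKERRQ(ierr);

   For a MATMPIAIJCUSPARSE matrix these calls dispatch to MatSetPreallocationCOO_MPIAIJCUSPARSE()
   and MatSetValuesCOO_MPIAIJCUSPARSE() through the "MatSetPreallocationCOO_C" and
   "MatSetValuesCOO_C" functions composed in MatConvert_MPIAIJ_MPIAIJCUSPARSE(). */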
202 
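/* Return the merged local matrix [Ad Ao] (diagonal and off-diagonal blocks side by side) and,
   optionally, an IS with the global column indices of its columns: the first dn entries are the
   locally owned columns, the remaining on entries are the ghost columns listed in cmap. */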
203 static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
204 {
205   Mat            Ad,Ao;
206   const PetscInt *cmap;
207   PetscErrorCode ierr;
208 
209   PetscFunctionBegin;
210   ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
211   ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
212   if (glob) {
213     PetscInt cst, i, dn, on, *gidx;
214 
215     ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
216     ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
217     ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
218     ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
219     for (i=0; i<dn; i++) gidx[i]    = cst + i;
220     for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
221     ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
222   }
223   PetscFunctionReturn(0);
224 }
225 
226 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
227 {
228   Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
229   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
230   PetscErrorCode     ierr;
231   PetscInt           i;
232 
233   PetscFunctionBegin;
234   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
235   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
236   if (PetscDefined(USE_DEBUG) && d_nnz) {
237     for (i=0; i<B->rmap->n; i++) {
238       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
239     }
240   }
241   if (PetscDefined(USE_DEBUG) && o_nnz) {
242     for (i=0; i<B->rmap->n; i++) {
243       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
244     }
245   }
246 #if defined(PETSC_USE_CTABLE)
247   ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
248 #else
249   ierr = PetscFree(b->colmap);CHKERRQ(ierr);
250 #endif
251   ierr = PetscFree(b->garray);CHKERRQ(ierr);
252   ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
253   ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
254   /* Because the off-diagonal matrix B will have been resized, we simply destroy it and create a new one each time */
255   ierr = MatDestroy(&b->B);CHKERRQ(ierr);
256   if (!b->A) {
257     ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
258     ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
259     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
260   }
261   if (!b->B) {
262     PetscMPIInt size;
263     ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
264     ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
265     ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
266     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
267   }
268   ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
269   ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
270   ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
271   ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
272   ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
273   ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
274   ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
275   ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
276   ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
277   ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
278   /* Let A, B use b's handle with pre-set stream
279   ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
280   ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
281   */
282   B->preallocated = PETSC_TRUE;
283   PetscFunctionReturn(0);
284 }
285 
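/* Parallel SpMV, y = A_d*x + B_o*x_ghost: the forward scatter that gathers the ghost values of x
   into a->lvec is overlapped with the multiply by the local diagonal block A_d; the off-diagonal
   block B_o is then applied with a MultAdd once the scatter has completed. */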
286 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
287 {
288   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
289   PetscErrorCode ierr;
290   PetscInt       nt;
291 
292   PetscFunctionBegin;
293   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
294   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
295   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
296   ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
297   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
298   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
299   PetscFunctionReturn(0);
300 }
301 
302 PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
303 {
304   Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
305   PetscErrorCode ierr;
306 
307   PetscFunctionBegin;
308   if (A->factortype == MAT_FACTOR_NONE) {
309     Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
310     PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
311     if (d_mat) {
312       Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
313       Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
314       PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
315       cudaError_t  err;
316       PetscScalar  *vals;
317       ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
318       err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
319       err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
320       err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
321       err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
322     }
323   }
324   ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
325   ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
326 
327   PetscFunctionReturn(0);
328 }
329 PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
330 {
331   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
332   PetscErrorCode ierr;
333   PetscInt       nt;
334 
335   PetscFunctionBegin;
336   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
337   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
338   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
339   ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
340   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
341   ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
342   PetscFunctionReturn(0);
343 }
344 
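/* Transpose SpMV, y = A_d^T*x plus the contribution of B_o^T*x: the off-diagonal part is computed
   into a->lvec first and then accumulated into y on the owning ranks with a reverse ADD_VALUES
   scatter. */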
345 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
346 {
347   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
348   PetscErrorCode ierr;
349   PetscInt       nt;
350 
351   PetscFunctionBegin;
352   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
353   if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
354   ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
355   ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
356   ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
357   ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
358   PetscFunctionReturn(0);
359 }
360 
361 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
362 {
363   Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
364   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
365 
366   PetscFunctionBegin;
367   switch (op) {
368   case MAT_CUSPARSE_MULT_DIAG:
369     cusparseStruct->diagGPUMatFormat = format;
370     break;
371   case MAT_CUSPARSE_MULT_OFFDIAG:
372     cusparseStruct->offdiagGPUMatFormat = format;
373     break;
374   case MAT_CUSPARSE_ALL:
375     cusparseStruct->diagGPUMatFormat    = format;
376     cusparseStruct->offdiagGPUMatFormat = format;
377     break;
378   default:
379     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
380   }
381   PetscFunctionReturn(0);
382 }
383 
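/* A typical user-level call that lands in the routine above (an illustrative sketch; the ELL
   format is only an example and requires a cuSPARSE version that still supports it):

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,MAT_CUSPARSE_ELL);CHKERRQ(ierr);
*/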
384 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
385 {
386   MatCUSPARSEStorageFormat format;
387   PetscErrorCode           ierr;
388   PetscBool                flg;
389   Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
390   Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
391 
392   PetscFunctionBegin;
393   ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
394   if (A->factortype==MAT_FACTOR_NONE) {
395     ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
396                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
397     if (flg) {
398       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
399     }
400     ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
401                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
402     if (flg) {
403       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
404     }
405     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
406                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
407     if (flg) {
408       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
409     }
410   }
411   ierr = PetscOptionsTail();CHKERRQ(ierr);
412   PetscFunctionReturn(0);
413 }
414 
415 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
416 {
417   PetscErrorCode             ierr;
418   Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
419   Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
420   PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
421   PetscFunctionBegin;
422   ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
423   if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
424     ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
425    #if defined(PETSC_HAVE_NVSHMEM)
426     {
427       PetscMPIInt result;
428       PetscBool   useNvshmem = PETSC_FALSE;
429       ierr = PetscOptionsGetBool(NULL,NULL,"-use_nvshmem",&useNvshmem,NULL);CHKERRQ(ierr);
430       if (useNvshmem) {
431         ierr = MPI_Comm_compare(PETSC_COMM_WORLD,PetscObjectComm((PetscObject)A),&result);CHKERRMPI(ierr);
432         if (result == MPI_IDENT || result == MPI_CONGRUENT) {ierr = VecAllocateNVSHMEM_SeqCUDA(mpiaij->lvec);CHKERRQ(ierr);}
433       }
434     }
435    #endif
436   }
437   if (d_mat) {
438     A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
439   }
440   PetscFunctionReturn(0);
441 }
442 
443 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
444 {
445   PetscErrorCode     ierr;
446   Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
447   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
448   cudaError_t        err;
449   cusparseStatus_t   stat;
450 
451   PetscFunctionBegin;
452   if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
453   if (cusparseStruct->deviceMat) {
454     Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
455     Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
456     PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
457     ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
458     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
459     if (jaca->compressedrow.use) {
460       err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
461     }
462     if (jacb->compressedrow.use) {
463       err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
464     }
465     err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
466     err = cudaFree(d_mat);CHKERRCUDA(err);
467   }
468   try {
469     if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
470     if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
471     stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
472     /* We want cusparseStruct to use PetscDefaultCudaStream
473     if (cusparseStruct->stream) {
474       err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
475     }
476     */
477     delete cusparseStruct->coo_p;
478     delete cusparseStruct->coo_pw;
479     delete cusparseStruct;
480   } catch(char *ex) {
481     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
482   }
483   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
484   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
485   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
486   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
487   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
488   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
489   PetscFunctionReturn(0);
490 }
491 
492 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
493 {
494   PetscErrorCode     ierr;
495   Mat_MPIAIJ         *a;
496   Mat_MPIAIJCUSPARSE *cusparseStruct;
497   cusparseStatus_t   stat;
498   Mat                A;
499 
500   PetscFunctionBegin;
501   if (reuse == MAT_INITIAL_MATRIX) {
502     ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
503   } else if (reuse == MAT_REUSE_MATRIX) {
504     ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
505   }
506   A = *newmat;
507   A->boundtocpu = PETSC_FALSE;
508   ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
509   ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);
510 
511   a = (Mat_MPIAIJ*)A->data;
512   if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
513   if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
514   if (a->lvec) {
515     ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
516   }
517 
518   if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
519     a->spptr = new Mat_MPIAIJCUSPARSE;
520 
521     cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
522     cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
523     cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
524     cusparseStruct->coo_p               = NULL;
525     cusparseStruct->coo_pw              = NULL;
526     cusparseStruct->stream              = 0; /* We should not need cusparseStruct->stream */
527     stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
528     stat = cusparseSetStream(cusparseStruct->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
529     cusparseStruct->deviceMat = NULL;
530   }
531 
532   A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
533   A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
534   A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
535   A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
536   A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
537   A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
538   A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
539   A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
540 
541   ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
542   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
543   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
544   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
545   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
546   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
547   PetscFunctionReturn(0);
548 }
549 
550 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
551 {
552   PetscErrorCode ierr;
553 
554   PetscFunctionBegin;
555   ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
556   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
557   ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
558   PetscFunctionReturn(0);
559 }
560 
561 /*@
562    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
563    (the default parallel PETSc format).  This matrix will ultimately be pushed down
564    to NVIDIA GPUs and use the cuSPARSE library for calculations. For good matrix
565    assembly performance the user should preallocate the matrix storage by setting
566    the parameters d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters
567    accurately, performance during matrix assembly can be increased by more than a factor of 50.
568 
569    Collective
570 
571    Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal portion of the local submatrix, or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal portion of the local submatrix, or NULL
578 
579    Output Parameter:
580 .  A - the matrix
581 
582    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
583    MatXXXXSetPreallocation() paradigm instead of this routine directly.
584    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
585 
586    Notes:
587    If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
588 
589    The AIJ format (also called the Yale sparse matrix format or
590    compressed row storage), is fully compatible with standard Fortran 77
591    storage.  That is, the stored row and column indices can begin at
592    either one (as in Fortran) or zero.  See the users' manual for details.
593 
594    Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
595    Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
596    allocation.  For large problems you MUST preallocate memory or you
597    will get TERRIBLE performance, see the users' manual chapter on matrices.
598 
599    By default, this format uses inodes (identical nodes) when possible, to
600    improve numerical efficiency of matrix-vector products and solves. We
601    search for consecutive rows with the same nonzero structure, thereby
602    reusing matrix information to achieve increased efficiency.
603 
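   Example (an illustrative sketch; the local sizes m and n and the per-row nonzero
   estimates are placeholders, and error checking is omitted for brevity):
.vb
   Mat A;
   MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,m,n,PETSC_DETERMINE,PETSC_DETERMINE,5,NULL,2,NULL,&A);
   // ... insert entries with MatSetValues() ...
   MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
   MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
.ve
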
604    Level: intermediate
605 
606 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
607 @*/
608 PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
609 {
610   PetscErrorCode ierr;
611   PetscMPIInt    size;
612 
613   PetscFunctionBegin;
614   ierr = MatCreate(comm,A);CHKERRQ(ierr);
615   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
616   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
617   if (size > 1) {
618     ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
619     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
620   } else {
621     ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
622     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
623   }
624   PetscFunctionReturn(0);
625 }
626 
627 /*MC
628    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
629 
630    A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
631    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
632    All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
633 
634    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
635    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
636    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
637    for communicators controlling multiple processes.  It is recommended that you call both of
638    the above preallocation routines for simplicity.
639 
640    Options Database Keys:
641 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
642 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
643 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
644 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
645 
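   A typical way to select this type at runtime is to use MatSetFromOptions() together with the
   -mat_type option (an illustrative sketch; the sizes and preallocation values are placeholders):
.vb
   MatCreate(comm,&A);
   MatSetSizes(A,m,n,M,N);
   MatSetFromOptions(A);
   MatMPIAIJSetPreallocation(A,d_nz,d_nnz,o_nz,o_nnz);
.ve
   and then run with -mat_type mpiaijcusparse (or aijcusparse).
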
646   Level: beginner
647 
648 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
650 M*/
651 
652 // Get a device (GPU) pointer to a stripped-down CSR view of the Mat; works for both Seq and MPI matrices.
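/* An illustrative (hypothetical) device-side consumer of the returned structure.  The field names
   match those filled in below (diag.i, diag.j, diag.a, diag.n), but the kernel itself is only a
   sketch and is not part of PETSc:

     __global__ void ScaleDiagBlock(PetscSplitCSRDataStructure *d_mat, PetscScalar alpha)
     {
       int row = blockIdx.x*blockDim.x + threadIdx.x;
       if (row < d_mat->diag.n) {
         for (int k = d_mat->diag.i[row]; k < d_mat->diag.i[row+1]; k++) d_mat->diag.a[k] *= alpha;
       }
     }
*/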
653 PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
654 {
655 #if defined(PETSC_USE_CTABLE)
656   SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
657 #else
658   PetscSplitCSRDataStructure **p_d_mat;
659   PetscMPIInt                size,rank;
660   MPI_Comm                   comm;
661   PetscErrorCode             ierr;
662   int                        *ai,*bi,*aj,*bj;
663   PetscScalar                *aa,*ba;
664 
665   PetscFunctionBegin;
666   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
667   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
668   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
669   if (A->factortype == MAT_FACTOR_NONE) {
670     CsrMatrix *matrixA,*matrixB=NULL;
671     if (size == 1) {
672       Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
673       p_d_mat = &cusparsestruct->deviceMat;
674       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
675       if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
676         matrixA = (CsrMatrix*)matstruct->mat;
677         bi = bj = NULL; ba = NULL;
678       } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
679     } else {
680       Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
681       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
682       p_d_mat = &spptr->deviceMat;
683       Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
684       Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
685       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
686       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
687       if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
688         if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
689         matrixA = (CsrMatrix*)matstructA->mat;
690         matrixB = (CsrMatrix*)matstructB->mat;
691         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
692         bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
693         ba = thrust::raw_pointer_cast(matrixB->values->data());
694       } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
695     }
696     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
697     aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
698     aa = thrust::raw_pointer_cast(matrixA->values->data());
699   } else {
700     *B = NULL;
701     PetscFunctionReturn(0);
702   }
703   // act like MatSetValues() because MatSetValues() is not called on the host for this path
704   if (A->assembled) {
705     if (A->was_assembled) {
706       ierr = PetscInfo(A,"Matrix has already been assembled more than once\n");CHKERRQ(ierr);
707     }
708     A->was_assembled = PETSC_TRUE; // normally done lazily during assembly; we no longer call that path here (AIJ AssemblyEnd does it) - is it still needed here?
709   } else {
710     SETERRQ(comm,PETSC_ERR_SUP,"Matrix must be assembled first");
711   }
712   if (!*p_d_mat) {
713     cudaError_t                 err;
714     PetscSplitCSRDataStructure  *d_mat, h_mat;
715     Mat_SeqAIJ                  *jaca;
716     PetscInt                    n = A->rmap->n, nnz;
717     // create and copy
718     ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
719     err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
720     err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
721     *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
722     if (size == 1) {
723       jaca = (Mat_SeqAIJ*)A->data;
724       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
725       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
726       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
727       h_mat.offdiag.a = NULL;
728     } else {
729       Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
730       Mat_SeqAIJ  *jacb;
731       jaca = (Mat_SeqAIJ*)aij->A->data;
732       jacb = (Mat_SeqAIJ*)aij->B->data;
733       if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
734       if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
735       // create colmap - this is usually done lazily in MatSetValues
736       aij->donotstash = PETSC_TRUE;
737       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
738       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
739       ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
740       aij->colmap[A->cmap->N] = -9;
741       ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
742       {
743         PetscInt ii;
744         for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
745       }
746       if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
747       // allocate B copy data
748       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
749       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
750       nnz = jacb->i[n];
751 
752       if (jacb->compressedrow.use) {
753         err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
754         err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
755       } else h_mat.offdiag.i = bi;
756       h_mat.offdiag.j = bj;
757       h_mat.offdiag.a = ba;
758 
759       err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
760       err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
761       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
762       h_mat.offdiag.n = n;
763     }
764     // allocate A copy data
765     nnz = jaca->i[n];
766     h_mat.diag.n = n;
767     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
768     ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
769     if (jaca->compressedrow.use) {
770       err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
771       err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
772     } else {
773       h_mat.diag.i = ai;
774     }
775     h_mat.diag.j = aj;
776     h_mat.diag.a = aa;
777     // copy pointers and metadata to device
778     err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
779     ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
780   } else {
781     *B = *p_d_mat;
782   }
783   A->assembled = PETSC_FALSE; // ready to write with MatSetValues() - this is done lazily in the normal MatSetValues() path
784   PetscFunctionReturn(0);
785 #endif
786 }
787