xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision ed502f03247b19485c139b6abc5093f1f2fe799c)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_CXX_COMPLEX_FIX
3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
4 
5 #include <petscconf.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
7 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
8 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
9 #include <thrust/advance.h>
10 
/* Thrust functor: assigns the first element of a zip-iterator tuple to the
   second.  Used below to gather permuted COO values into a packed array. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
20 
/* Insert/add numerical values v (given in the caller's original COO order from
   MatSetPreallocationCOO) into the diagonal (a->A) and off-diagonal (a->B)
   sequential blocks.  When preallocation had to reorder the entries so that
   all diagonal-block entries precede the off-diagonal ones, a device
   permutation cusp->coo_p exists: v is uploaded and gathered through it into
   the work array cusp->coo_pw.  Otherwise v is already correctly ordered and
   is simply split at cusp->coo_nd. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total COO entries on this rank */
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (cusp->coo_p && v) {
    THRUSTARRAY *w;
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
    w = new THRUSTARRAY(n);
    w->assign(v,v+n); /* host -> device upload of the user-supplied values */
    /* gather through the permutation: coo_pw[k] = w[coo_p[k]] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(w->begin(),cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(w->begin(),cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    delete w;
    /* first coo_nd permuted values feed the diagonal block, the rest the off-diagonal block */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    /* no reordering was needed: pass v through, split at coo_nd */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* mark the matrix assembled with values valid on both host and device */
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
57 
/* Predicate on a (row, col, perm) tuple: true when the column index (second
   element) lies inside the local diagonal-block range [_cstart,_cend). */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    /* same as !(col < _cstart || col >= _cend), by De Morgan */
    const PetscInt col = thrust::get<1>(t);
    return col >= _cstart && col < _cend;
  }
};
70 
/* Predicate on a column index: true when it falls outside the local
   diagonal-block range [_cstart,_cend), i.e. the entry is off-diagonal. */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    /* same as (c < _cstart || c >= _cend), by De Morgan */
    return !(c >= _cstart && c < _cend);
  }
};
82 
/* Unary functor mapping a global index to a process-local one by
   subtracting the start of the ownership range. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt first) : _start(first) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &g)
  {
    return g - _start;
  }
};
94 
/* Set the nonzero pattern of the parallel matrix B from n COO entries
   (coo_i[k], coo_j[k]), performing most of the work on the device:
   - partition entries into diagonal-block / off-diagonal-block groups,
     recording the permutation (cusp->coo_p) when reordering was required;
   - convert indices from global to local numbering;
   - build the ghost-column map b->garray by sort+unique of off-diagonal
     columns, and remap off-diagonal columns through it;
   - recreate the sequential blocks b->A and b->B and preallocate them. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;      /* number of distinct ghost (off-diagonal) columns */
  THRUSTINTARRAY         d_i(n);        /* device copy of the COO row indices */
  THRUSTINTARRAY         d_j(n);        /* device copy of the COO column indices */
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* discard any previous preallocation state before rebuilding the blocks */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  /* locate the first off-diagonal entry, and the first diagonal entry after it;
     if both exist, the input is not already partitioned and must be permuted */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);  /* permutation from user order to partitioned order */
    cusp->coo_pw = new THRUSTARRAY(n);    /* work array used later by MatSetValuesCOO */
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    /* stable-move diagonal-block entries (and their original positions) to the front */
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # off-diagonal entries */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  /* sort+unique the off-diagonal columns on the device to get the ghost-column list */
  auto o_j = d_j.begin();
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* remap the (unsorted) off-diagonal columns in jj to local ghost indices */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* (re)create the sequential diagonal and off-diagonal blocks */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  /* propagate storage format, cuSPARSE handle and stream to both blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  /* pattern is set but no values yet: matrix is not assembled */
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
193 
/* Merge the diagonal and off-diagonal blocks of A into one sequential matrix
   A_loc; when glob is non-NULL, also return the global column indices of the
   merged matrix (owned columns first, then the ghost columns from cmap). */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst,j,nd,no,*gidx;

    ierr = MatGetLocalSize(Ad,NULL,&nd);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&no);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(nd+no,&gidx);CHKERRQ(ierr);
    /* owned columns are contiguous from cst; ghost columns come from cmap */
    for (j=0; j<nd+no; j++) gidx[j] = (j < nd) ? cst + j : cmap[j-nd];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),nd+no,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
216 
/* Preallocate the diagonal (b->A) and off-diagonal (b->B) sequential
   AIJCUSPARSE blocks of B, creating them on first use, and propagate the
   cuSPARSE storage formats, handle and stream to both blocks. */
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           row;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* validate user-supplied per-row nonzero counts in debug builds */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (d_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",row,d_nnz[row]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (o_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",row,o_nnz[row]);
    }
  }
  if (!B->preallocated) {
    /* first preallocation: explicitly create the two MATSEQAIJCUSPARSE blocks */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  if (b->lvec) { ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* both blocks share the parallel matrix's format choices, handle and stream */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
265 
/* y = A*x for the parallel matrix: start the ghost-value scatter, overlap it
   with the local diagonal-block product, then add the off-diagonal
   contribution once the ghost values have arrived. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  /* begin communicating ghost entries of xx into the local work vector ... */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* ... overlapped with the diagonal-block multiply: yy = A_d * xx */
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* yy += A_o * lvec, using the received ghost values */
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
281 
/* Zero all entries of A.  When a device ("split CSR") matrix exists, its
   value arrays live in separate GPU allocations, so they are zeroed directly
   with cudaMemset in addition to the host-side MatZeroEntries on the blocks.
   Fix: the PetscInfo message spelled "offfdiag" (triple f). */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n]; /* nnz counts from the host CSR row pointers */
      cudaError_t  err;
      PetscScalar  *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      /* fetch the device pointer to each value array, then memset it on the device */
      err = cudaMemcpy(&vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset(vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy(&vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset(vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  /* zero the (host-managed) sequential blocks as usual */
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
/* zz = yy + A*xx: same communication/computation overlap as MatMult, using
   the multadd kernels of the diagonal and off-diagonal blocks. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  /* start the ghost-value scatter ... */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* ... overlapped with zz = yy + A_d * xx */
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* zz += A_o * lvec */
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
324 
/* yy = A^T * xx: compute both local transpose products first, then scatter
   the off-diagonal contribution in reverse (ghost -> owner) with ADD_VALUES. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  /* lvec = A_o^T * xx: contributions destined for other ranks */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* yy = A_d^T * xx: local contribution */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* accumulate the remote contributions into yy */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
340 
/* Record the requested cuSPARSE storage format for the diagonal block, the
   off-diagonal block, or both.  Fix: the unsupported-op error message listed
   MAT_CUSPARSE_MULT_DIAG twice and a nonexistent MAT_CUSPARSE_MULT_ALL
   instead of MAT_CUSPARSE_MULT_OFFDIAG and MAT_CUSPARSE_ALL (the enum values
   actually handled by this switch). */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusp->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusp->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusp->diagGPUMatFormat    = format;
    cusp->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
363 
/* Process runtime options selecting the cuSPARSE storage format of the
   diagonal and/or off-diagonal blocks.  Factored matrices expose no such
   options, hence the MAT_FACTOR_NONE guard. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat fmt;
  PetscErrorCode           ierr;
  PetscBool                set;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) { ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,fmt);CHKERRQ(ierr); }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->offdiagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) { ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,fmt);CHKERRQ(ierr); }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) { ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr); }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
394 
/* Finish assembly via the MPIAIJ implementation, switch the local work
   vector to a CUDA vector on first final assembly, and mark the data as
   resident on the GPU when a device (split CSR) matrix is in use. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *aij  = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusp = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  PetscSplitCSRDataStructure *dmat = cusp->deviceMat;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FINAL_ASSEMBLY && !A->was_assembled) {
    ierr = VecSetType(aij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (dmat) A->offloadmask = PETSC_OFFLOAD_GPU; /* assembled on the device */
  PetscFunctionReturn(0);
}
412 
/* Destructor for MATMPIAIJCUSPARSE: free the optional device (split CSR)
   matrix and all cuSPARSE-specific state, unregister the composed methods,
   then fall through to the base MPIAIJ destructor. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* bring the device-side struct to the host so its interior pointers can be freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* row-pointer arrays were separately allocated only in the compressed-row case */
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* thrust containers may throw; translate exceptions into PETSc errors */
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* remove the method registrations installed by MatConvert */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
459 
/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE: duplicate/copy/reuse B per the
   MatReuse flag, switch the blocks and work vector to their CUDA types,
   allocate the cuSPARSE-specific context on first conversion, and install
   the CUSPARSE function table and composed methods. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* for MAT_INPLACE_MATRIX, *newmat already is B */
  A = *newmat;

  /* vectors created from this matrix default to CUDA vectors */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  /* create the cuSPARSE context unless one survives from a previous conversion */
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0; /* default stream */
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  /* install the CUSPARSE implementations of the matrix operations */
  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
515 
/* Type constructor for MATMPIAIJCUSPARSE: build a plain MPIAIJ matrix and
   convert it in place, which installs the CUSPARSE function table. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* fail early if CUDA cannot be initialized */
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
526 
527 /*@
528    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
530    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
531    assembly performance the user should preallocate the matrix storage by setting
532    the parameter nz (or the array nnz).  By setting these parameters accurately,
533    performance during matrix assembly can be increased by more than a factor of 50.
534 
535    Collective
536 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the
           off-diagonal block (possibly different for each row) or NULL
544 
545    Output Parameter:
546 .  A - the matrix
547 
548    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
549    MatXXXXSetPreallocation() paradigm instead of this routine directly.
550    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
551 
552    Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
554 
555    The AIJ format (also called the Yale sparse matrix format or
556    compressed row storage), is fully compatible with standard Fortran 77
557    storage.  That is, the stored row and column indices can begin at
558    either one (as in Fortran) or zero.  See the users' manual for details.
559 
   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
562    allocation.  For large problems you MUST preallocate memory or you
563    will get TERRIBLE performance, see the users' manual chapter on matrices.
564 
565    By default, this format uses inodes (identical nodes) when possible, to
566    improve numerical efficiency of matrix-vector products and solves. We
567    search for consecutive rows with the same nonzero structure, thereby
568    reusing matrix information to achieve increased efficiency.
569 
570    Level: intermediate
571 
572 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
573 @*/
/* Convenience constructor: create a matrix of type MATSEQAIJCUSPARSE on a
   single-process communicator, or MATMPIAIJCUSPARSE otherwise, with the
   given sizes and preallocation. */
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size == 1) {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
592 
593 /*MC
594    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
595 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
597    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
598    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
599 
600    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
601    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
602    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
603    for communicators controlling multiple processes.  It is recommended that you call both of
604    the above preallocation routines for simplicity.
605 
606    Options Database Keys:
607 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
608 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
609 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
610 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
611 
612   Level: beginner
613 
.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
617 
/*
  MatCUSPARSEGetDeviceMatWrite - get a GPU pointer to a stripped-down CSR view of A
  (diagonal block, plus off-diagonal block and colmap in parallel) so device kernels
  can write values directly. Works for both Seq and MPI AIJCUSPARSE matrices.

  The device structure is created lazily on the first call and cached inside the
  matrix (cusparsestruct->deviceMat / spptr->deviceMat); later calls return the
  cached pointer.

  Side effects: A must already be assembled; on return A->assembled is set to
  PETSC_FALSE (the caller is expected to write values from the device and then
  assemble again). In parallel, stashing and off-process entries are disabled and
  further nonzero insertion is frozen.

  Output: *B is the device pointer, or NULL for factored matrices (unsupported).
  Not available with --with-ctable builds.
*/
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;   // address of the cached device-matrix pointer inside A's impl data
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;  // device CSR row offsets / column indices (a* = diag, b* = offdiag)
  PetscScalar                *aa,*ba;          // device CSR values

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      // sequential: the whole matrix is the "diagonal" block; there is no off-diagonal part
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      // parallel: collect device CSR data for both the diagonal (A) and off-diagonal (B) blocks
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL; // factored matrices are not supported; signal the caller there is no device Mat
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
  }
  if (!*p_d_mat) {
    // first call: build a host-side mirror (h_mat) of the device struct, then copy it up
    cudaError_t                 err;
    PetscSplitCSRDataStructure  *d_mat, h_mat;
    Mat_SeqAIJ                  *jaca;
    PetscInt                    n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    // zero h_mat so any field not explicitly assigned below (e.g. colmap and offdiag.n
    // in the sequential branch) reaches the device as NULL/0 rather than as
    // uninitialized stack bytes
    ierr = PetscMemzero(&h_mat,sizeof(h_mat));CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ  *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazy) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9; // sentinel: detects an out-of-range garray entry in the loop below
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;

      if (jacb->compressedrow.use) {
        // compressed rows on the host: upload the full host row-offset array instead of
        // reusing the (compressed) device offsets
        err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    h_mat.rank = rank; // rank was already queried above; avoids a redundant MPI_Comm_rank
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat; // already created on an earlier call: return the cached device pointer
  }
  A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}
753