xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 2cb5e1cc91ad4e0472b8976614576d28ebef7100)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_CXX_COMPLEX_FIX
3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
4 
5 #include <petscconf.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
7 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
8 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
9 #include <thrust/advance.h>
10 
11 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
12 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
13 
/* Functor for thrust::for_each over a zip iterator: assigns tuple element 0
   to tuple element 1 (i.e. copies the source range into the destination
   range), usable on both host and device. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
23 
/* Set the values of a matrix whose sparsity pattern was given in COO form via
   MatSetPreallocationCOO_MPIAIJCUSPARSE(): v holds one scalar per original
   (coo_i,coo_j) pair, in the caller's original COO ordering. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total entries = diag-block + offdiag-block counts */
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (cusp->coo_p) { /* preallocation reordered the entries diag-first; coo_p maps split order -> user order */
    /* NOTE(review): this branch reads v unconditionally (w(v,v+n)), while the else
       branch tolerates v == NULL -- confirm callers never pass NULL when coo_p is set */
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
    THRUSTARRAY w(v,v+n); /* device copy of the user's values */
    /* gather w through the permutation coo_p into the work array coo_pw */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(w.begin(),cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(w.begin(),cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    /* first coo_nd permuted values feed the diagonal block, the remainder the off-diagonal block */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else { /* entries were already contiguous diag-first: pass v through directly */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* mark the matrix assembled, with data valid on both host and device sides */
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
57 
/* Predicate on a (row, col, perm) tuple: true when the column index (tuple
   element 1) lies inside [_cstart,_cend), i.e. the entry belongs to the
   diagonal block of the parallel matrix. */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return _cstart <= col && col < _cend;
  }
};
70 
/* Predicate on a column index: true when c falls outside the owned column
   range [_cstart,_cend), i.e. the entry belongs to the off-diagonal block. */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return !(_cstart <= c && c < _cend);
  }
};
82 
/* Unary functor mapping a global index to a local one by subtracting the
   start of this rank's ownership range. */
struct GlobToLoc
{
  PetscInt _start; /* first index owned by this rank */

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};
94 
/* Define the nonzero structure of the parallel matrix from a local list of n
   (coo_i[k],coo_j[k]) global index pairs.  Entries are split between the
   diagonal block (owned columns) and the off-diagonal block; if the input is
   not already ordered diag-first, the permutation needed to reorder user
   values is stored in cusp->coo_p for MatSetValuesCOO_MPIAIJCUSPARSE(). */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;   /* number of distinct off-diagonal (ghost) columns */
  THRUSTINTARRAY         d_i(n);     /* device copies of the COO indices */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* discard any previous diagonal/off-diagonal blocks and scatter data */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  /* locate the first off-diagonal entry; if any diagonal entry follows it the
     list is not diag-first and must be partitioned (recording the permutation) */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n); /* permutation: split order -> original user order */
    cusp->coo_pw = new THRUSTARRAY(n);   /* work array for permuted values */
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    /* partition (i,j,perm) triples so diagonal-block entries come first */
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # entries in the diagonal block */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # entries in the off-diagonal block */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  thrust::advance(o_j,cusp->coo_nd);
  /* sort+unique compresses the off-diagonal global columns into b->garray */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* translate the host copy of off-diagonal columns to compressed local indices */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* (re)create the sequential diagonal (rmap->n x cmap->n) and
     off-diagonal (rmap->n x noff) blocks */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  /* propagate the parent's cusparse configuration to both blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  /* no values set yet: host side is authoritative and the matrix is unassembled */
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
193 
/* Classic d_nz/d_nnz/o_nz/o_nnz preallocation for MATMPIAIJCUSPARSE:
   validates the per-row counts (debug builds only), creates the diagonal and
   off-diagonal blocks as MATSEQAIJCUSPARSE if not yet preallocated,
   preallocates them, and pushes the parent's cusparse format/handle/stream
   settings down to both blocks. */
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* sanity-check user-provided per-row nonzero counts (debug builds only) */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  if (b->lvec) { /* ghost-value work vector should be CUDA-backed as well */
    ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* forward the cusparse configuration to both blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
242 
/* yy = A*xx for an MPIAIJCUSPARSE matrix: the halo scatter of xx is
   overlapped with the diagonal-block product, then the off-diagonal block's
   contribution is accumulated from the ghost vector. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  /* start communicating ghost values while the local (diagonal-block) product runs */
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
258 
/* Zero all stored entries of an MPIAIJCUSPARSE matrix.  When the fused
   device-resident matrix (spptr->deviceMat) exists, its value arrays are
   zeroed directly on the GPU so host and device stay consistent; the
   host-side blocks are always zeroed via MatZeroEntries().
   Fix: log message said "offfdiag" (typo) -- corrected to "offdiag". */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n]; /* nnz counts from CSR row offsets */
      cudaError_t  err;
      PetscScalar  *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      /* fetch the device-side value pointers, then zero the value arrays in place */
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
/* zz = yy + A*xx for an MPIAIJCUSPARSE matrix; the halo scatter of xx is
   overlapped with the diagonal-block product, then the off-diagonal block's
   contribution is accumulated into zz. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  /* start communicating ghost values while the local product runs */
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
301 
/* yy = A^T * xx: compute the off-diagonal transpose product into the ghost
   vector and the local transpose product into yy, then a reverse scatter
   adds the ghost contributions into yy. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,xlen);
  ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
317 
/* Record the requested cuSPARSE storage format for the diagonal and/or
   off-diagonal blocks; the format takes effect when the blocks are rebuilt.
   Fix: the unsupported-op error message listed MAT_CUSPARSE_MULT_DIAG twice
   and the nonexistent MAT_CUSPARSE_MULT_ALL; it now names the three enum
   values actually handled by the switch. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
340 
/* Process the -mat_cusparse_* options that select the GPU storage format
   (csr/ell/hyb) used for SpMV by the diagonal and off-diagonal blocks. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) { /* format options apply only to unfactored matrices */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    /* this option sets both blocks at once */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
371 
/* Finish assembly: read the device-side nonzero state (if a fused device
   matrix exists) BEFORE the host assembly bumps A->nonzerostate, defer to the
   MPIAIJ implementation, make the ghost vector CUDA-backed on first final
   assembly, and flag the data as GPU-resident if assembly happened on the
   device. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
  PetscInt                   nnz_state = A->nonzerostate; /* fallback when there is no device matrix */
  PetscFunctionBegin;
  if (d_mat) {
    cudaError_t                err;
    /* fetch the nonzero state recorded by device-side value insertion */
    err = cudaMemcpy( &nnz_state, &d_mat->nonzerostate, sizeof(PetscInt), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
  }
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  /* device state newer than the (post-assembly) host state means the values
     were assembled on the GPU, so the GPU copy is authoritative */
  if (nnz_state > A->nonzerostate) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}
394 
/* Destructor for MATMPIAIJCUSPARSE: frees the optional fused device matrix,
   the cusparse handle/stream and COO scratch arrays, removes the composed
   methods, then chains to the MPIAIJ destructor. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    /* free the device-side split-CSR mirror; read it back first to get the
       device pointers it owns */
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* NOTE(review): diag.i/offdiag.i are freed only when compressed rows are in
       use -- presumably they alias storage owned elsewhere otherwise; confirm
       against MatCUSPARSEGetDeviceMatWrite() */
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    /* release per-block cusparse resources before destroying the shared handle */
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* remove the methods composed by MatConvert_MPIAIJ_MPIAIJCUSPARSE() */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
440 
/* Convert an MPIAIJ matrix to MATMPIAIJCUSPARSE: retypes the sequential
   blocks and ghost vector to their CUDA flavors, allocates the
   Mat_MPIAIJCUSPARSE context (with CSR defaults and a fresh cusparse handle),
   and installs the CUSPARSE-specific operations and composed methods. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat; /* for MAT_INPLACE_MATRIX neither branch runs and *newmat is B itself */

  /* vectors created against this matrix default to CUDA */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  /* retype any already-existing blocks and ghost vector for the GPU */
  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    /* fresh conversion: build the CUSPARSE context with default CSR formats */
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
495 
/* Constructor registered for MATMPIAIJCUSPARSE: verifies CUDA is usable,
   builds a plain MPIAIJ matrix, then converts it in place to the CUSPARSE
   flavor. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
506 
507 /*@
508    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
510    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
511    assembly performance the user should preallocate the matrix storage by setting
512    the parameter nz (or the array nnz).  By setting these parameters accurately,
513    performance during matrix assembly can be increased by more than a factor of 50.
514 
515    Collective
516 
517    Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal block
         (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal block
         (possibly different for each row) or NULL
524 
525    Output Parameter:
526 .  A - the matrix
527 
528    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
529    MatXXXXSetPreallocation() paradigm instead of this routine directly.
530    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
531 
532    Notes:
533    If nnz is given then nz is ignored
534 
535    The AIJ format (also called the Yale sparse matrix format or
536    compressed row storage), is fully compatible with standard Fortran 77
537    storage.  That is, the stored row and column indices can begin at
538    either one (as in Fortran) or zero.  See the users' manual for details.
539 
540    Specify the preallocated storage with either nz or nnz (not both).
541    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
542    allocation.  For large problems you MUST preallocate memory or you
543    will get TERRIBLE performance, see the users' manual chapter on matrices.
544 
545    By default, this format uses inodes (identical nodes) when possible, to
546    improve numerical efficiency of matrix-vector products and solves. We
547    search for consecutive rows with the same nonzero structure, thereby
548    reusing matrix information to achieve increased efficiency.
549 
550    Level: intermediate
551 
552 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
553 @*/
/* Create an AIJ matrix backed by cuSPARSE: sequential (MATSEQAIJCUSPARSE) on
   one process, parallel (MATMPIAIJCUSPARSE) otherwise, and apply the
   corresponding preallocation. */
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    nproc;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&nproc);CHKERRQ(ierr);
  if (nproc == 1) {
    /* uniprocess communicator: sequential flavor, only the diagonal counts apply */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
572 
573 /*MC
574    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
575 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
577    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
578    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
579 
580    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
581    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
582    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
583    for communicators controlling multiple processes.  It is recommended that you call both of
584    the above preallocation routines for simplicity.
585 
586    Options Database Keys:
587 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
588 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
589 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
590 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
591 
592   Level: beginner
593 
594  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
597 
598 // get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
599 PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
600 {
601 #if defined(PETSC_USE_CTABLE)
602   SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
603 #else
604   PetscSplitCSRDataStructure **p_d_mat;
605   PetscMPIInt                size,rank;
606   MPI_Comm                   comm;
607   PetscErrorCode             ierr;
608   int                        *ai,*bi,*aj,*bj;
609   PetscScalar                *aa,*ba;
610 
611   PetscFunctionBegin;
612   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
613   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
614   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
615   if (A->factortype == MAT_FACTOR_NONE) {
616     CsrMatrix *matrixA,*matrixB=NULL;
617     if (size == 1) {
618       Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
619       p_d_mat = &cusparsestruct->deviceMat;
620       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
621       if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
622 	matrixA = (CsrMatrix*)matstruct->mat;
623 	bi = bj = NULL; ba = NULL;
624       } else {
625 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
626       }
627     } else {
628       Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
629       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
630       p_d_mat = &spptr->deviceMat;
631       Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
632       Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
633       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
634       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
635       if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
636 	if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
637 	matrixA = (CsrMatrix*)matstructA->mat;
638 	matrixB = (CsrMatrix*)matstructB->mat;
639 	bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
640 	bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
641 	ba = thrust::raw_pointer_cast(matrixB->values->data());
642 	if (rank==-1) {
643 	  for(unsigned int i = 0; i < matrixB->row_offsets->size(); i++)
644 	    std::cout << "\trow_offsets[" << i << "] = " << (*matrixB->row_offsets)[i] << std::endl;
645 	  for(unsigned int i = 0; i < matrixB->column_indices->size(); i++)
646 	    std::cout << "\tcolumn_indices[" << i << "] = " << (*matrixB->column_indices)[i] << std::endl;
647 	  for(unsigned int i = 0; i < matrixB->values->size(); i++)
648 	    std::cout << "\tvalues[" << i << "] = " << (*matrixB->values)[i] << std::endl;
649 	}
650       } else {
651 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
652       }
653     }
654     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
655     aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
656     aa = thrust::raw_pointer_cast(matrixA->values->data());
657   } else {
658     *B = NULL;
659     PetscFunctionReturn(0);
660   }
661   // act like MatSetValues because not called on host
662   if (A->assembled) {
663     if (A->was_assembled) {
664       ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
665     }
666     A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
667   } else {
668     SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
669   }
670   if (!*p_d_mat) {
671     cudaError_t                 err;
672     PetscSplitCSRDataStructure  *d_mat, h_mat;
673     Mat_SeqAIJ                  *jaca;
674     PetscInt                    n = A->rmap->n, nnz;
675     // create and copy
676     ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
677     err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
678     err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
679     *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
680     if (size == 1) {
681       jaca = (Mat_SeqAIJ*)A->data;
682       h_mat.nonzerostate = A->nonzerostate;
683       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
684       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
685       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
686       h_mat.offdiag.a = NULL;
687       h_mat.seq = PETSC_TRUE;
688     } else {
689       Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
690       Mat_SeqAIJ  *jacb;
691       h_mat.seq = PETSC_FALSE; // for MatAssemblyEnd_SeqAIJCUSPARSE
692       jaca = (Mat_SeqAIJ*)aij->A->data;
693       jacb = (Mat_SeqAIJ*)aij->B->data;
694       h_mat.nonzerostate = aij->A->nonzerostate; // just keep one nonzero state?
695       if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
696       if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
697       // create colmap - this is usually done (lazy) in MatSetValues
698       aij->donotstash = PETSC_TRUE;
699       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
700       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
701       ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
702       aij->colmap[A->cmap->N] = -9;
703       ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
704       {
705 	PetscInt ii;
706 	for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
707       }
708       if(aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
709       // allocate B copy data
710       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
711       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
712       nnz = jacb->i[n];
713 
714       if (jacb->compressedrow.use) {
715 	err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
716 	err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
717       } else {
718 	h_mat.offdiag.i = bi;
719       }
720       h_mat.offdiag.j = bj;
721       h_mat.offdiag.a = ba;
722 
723       err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
724       err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
725       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
726       h_mat.offdiag.n = n;
727     }
728     // allocate A copy data
729     nnz = jaca->i[n];
730     h_mat.diag.n = n;
731     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
732     ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr);
733     if (jaca->compressedrow.use) {
734       err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
735       err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
736     } else {
737       h_mat.diag.i = ai;
738     }
739     h_mat.diag.j = aj;
740     h_mat.diag.a = aa;
741     // copy pointers and metadata to device
742     err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
743     ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
744   } else {
745     *B = *p_d_mat;
746   }
747   A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
748   PetscFunctionReturn(0);
749 #endif
750 }
751