xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision b698fc57f0bea7237255b29c1b77df0acc362ffd)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_CXX_COMPLEX_FIX
3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
4 
5 #include <petscconf.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
7 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
8 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
9 #include <thrust/advance.h>
10 
11 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
12 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
13 
/* Functor for thrust::for_each over a zip iterator: assigns the first tuple
   element to the second (dst = src), callable on host or device. */
struct VecCUDAEquals
{
  template <typename TupleT>
  __host__ __device__
  void operator()(TupleT zt)
  {
    thrust::get<1>(zt) = thrust::get<0>(zt);
  }
};
23 
/* MatSetValuesCOO implementation for MPIAIJCUSPARSE.

   v holds one scalar per COO entry, in the order of the original (coo_i,coo_j)
   lists passed to MatSetPreallocationCOO_MPIAIJCUSPARSE; imode is forwarded to
   the sequential COO kernels.  If preallocation had to reorder the entries
   (cusp->coo_p holds the permutation built there), the values are first copied
   to the GPU and gathered into the diag-first/offdiag-second layout expected
   by the two sequential matrices. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total COO entries: diagonal + off-diagonal */
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (cusp->coo_p) { /* a permutation was recorded at preallocation time */
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
    THRUSTARRAY w(v,v+n); /* copy user values host -> device */
    /* gather w through the permutation coo_p into the work array coo_pw */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(w.begin(),cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(w.begin(),cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    /* first coo_nd permuted values feed the diagonal block, the remainder the off-diagonal block */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else { /* entries were already diag-first: hand v straight to the sequential kernels */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  /* mark the matrix assembled, with host and device copies considered in sync */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
57 
/* Tuple predicate: true when the column index (second tuple element) lies
   inside the local diagonal-block column range [_cstart,_cend). */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return (col >= _cstart) && (col < _cend);
  }
};
70 
/* Predicate: true when a column index falls outside the local diagonal-block
   column range [_cstart,_cend), i.e. belongs to the off-diagonal part. */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return !(_cstart <= c && c < _cend);
  }
};
82 
/* Unary transform: shifts a global index down by _start to obtain the
   corresponding local index. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    const PetscInt local = c - _start;
    return local;
  }
};
94 
/* MatSetPreallocationCOO implementation for MPIAIJCUSPARSE.

   Given n COO entries with global indices (coo_i[], coo_j[]), this routine
   - tears down any previous diagonal/off-diagonal blocks and the column map,
   - copies the index lists to the GPU and, if needed, partitions them so that
     entries whose column lies in the local diagonal range come first
     (recording the permutation of the original order in cusp->coo_p),
   - converts indices from global to local numbering,
   - builds b->garray (sorted distinct global columns of the off-diagonal
     part) and maps the off-diagonal columns into that compact numbering,
   - recreates b->A and b->B as SEQAIJCUSPARSE matrices and forwards the
     split COO lists to their preallocation routines.
   The diagonal/off-diagonal entry counts are stored in cusp->coo_nd/coo_no
   for use by the matching MatSetValuesCOO. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;      /* host copy of the off-diagonal column indices */
  size_t                 noff = 0; /* number of distinct off-diagonal columns */
  THRUSTINTARRAY         d_i(n);   /* device row indices */
  THRUSTINTARRAY         d_j(n);   /* device column indices */
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* discard previous storage; handles are cleared before the blocks are destroyed */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  /* locate the first off-diagonal entry and the first diagonal entry after it;
     if both exist the input is not already diag-first and must be partitioned */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);  /* permutation: original position of each reordered entry */
    cusp->coo_pw = new THRUSTARRAY(n);    /* scratch for permuted values in MatSetValuesCOO */
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    /* partition (i,j,identity) tuples diag-first; the identity column records the permutation */
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* entries in the diagonal block */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* entries in the off-diagonal block */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  thrust::advance(o_j,cusp->coo_nd);
  /* sort+unique the device copy to extract the distinct global columns (garray) */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* translate the off-diagonal global columns in jj to compact local numbering via a hash mapping;
     IS_GTOLM_DROP would drop indices not in garray, so n must come back equal to coo_no */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* recreate the sequential diagonal (local n columns) and off-diagonal (noff columns) blocks */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  /* propagate the parent's storage formats, handle and stream to both blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  /* no values have been set yet: matrix is not assembled */
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
193 
/* MatMPIAIJSetPreallocation implementation for MPIAIJCUSPARSE: creates (on
   first call) the sequential diagonal/off-diagonal blocks as SEQAIJCUSPARSE
   matrices, preallocates them with (d_nz,d_nnz)/(o_nz,o_nnz), and propagates
   the parent's storage formats, cuSPARSE handle and stream to both blocks. */
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* debug-only sanity checks: per-row nonzero counts must be non-negative */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  if (b->lvec) {
    ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* share the parent's formats, cuSPARSE handle and stream with both blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
242 
/* Matrix-vector product yy = A*xx for MPIAIJCUSPARSE: starts the scatter of
   off-process entries of xx into the local work vector, overlaps that with
   the diagonal-block product, then accumulates the off-diagonal part. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* diagonal block uses only locally owned xx entries, so it can overlap the scatter */
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* off-diagonal block consumes the gathered ghost values and adds into yy */
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
258 
/* Zeroes all stored values of the matrix.  When a device ("split CSR")
   mirror exists, its diagonal and off-diagonal value arrays are zeroed
   directly on the GPU before the usual MatZeroEntries calls on the blocks. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n]; /* nnz counts from the CSR row pointers */
      cudaError_t  err;
      PetscScalar  *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr); /* fixed typo: was "offfdiag" */
      /* fetch the device-side value pointers stored inside the device struct, then memset them */
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
/* Computes zz = yy + A*xx for MPIAIJCUSPARSE, overlapping the ghost-value
   scatter with the diagonal-block multiply-add as in MatMult. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* zz = yy + diag(A)*xx, overlapped with the scatter */
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* zz += offdiag(A)*ghost(xx) */
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
301 
/* Computes yy = A^T*xx for MPIAIJCUSPARSE: applies both transposed blocks
   locally, then adds the off-diagonal contributions to their owners with a
   reverse scatter. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  /* B^T*xx lands in the local work vector; its entries belong to other ranks */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* reverse scatter accumulates the work-vector entries into the owning ranks' yy */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
317 
/* Sets the cuSPARSE storage format used for SpMV on the diagonal block,
   the off-diagonal block, or both, of an MPIAIJCUSPARSE matrix. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed error text: it previously listed MAT_CUSPARSE_MULT_DIAG twice and a
       nonexistent MAT_CUSPARSE_MULT_ALL; the valid ops are exactly the cases above */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
340 
/* Processes runtime options for MPIAIJCUSPARSE matrices: the cuSPARSE storage
   format of the diagonal block, the off-diagonal block, or both at once.
   Options are only consulted for non-factored matrices. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    /* off-diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    /* both blocks at once */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
371 
/* Finishes assembly through the standard MPIAIJ path, then promotes the local
   work vector to a CUDA vector on the first final assembly.  If a device
   ("split CSR") matrix exists, assembly may have happened on the GPU, so the
   offload mask is set accordingly. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    /* lvec is created during the first assembly; make it a CUDA vector */
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}
389 
/* Destructor for MPIAIJCUSPARSE matrices: frees the device-side split-CSR
   mirror (if one was created), the cuSPARSE handle, stream and COO scratch
   arrays, unregisters the composed methods, and finally delegates to the
   generic MPIAIJ destructor. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* pull the device struct back to the host to recover the embedded device pointers */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* NOTE(review): the row-pointer arrays are freed only in the compressed-row
       case, presumably mirroring how they were allocated in the creation path --
       confirm against MatCUSPARSEGetDeviceMatWrite */
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    /* clear the blocks' references to the shared handle before destroying it */
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) { /* NOTE(review): only catches thrown char* messages, not std::exception */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* remove the composed methods installed by MatConvert */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
435 
436 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
437 {
438   PetscErrorCode     ierr;
439   Mat_MPIAIJ         *a;
440   Mat_MPIAIJCUSPARSE *cusparseStruct;
441   cusparseStatus_t   stat;
442   Mat                A;
443 
444   PetscFunctionBegin;
445   if (reuse == MAT_INITIAL_MATRIX) {
446     ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
447   } else if (reuse == MAT_REUSE_MATRIX) {
448     ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
449   }
450   A = *newmat;
451 
452   ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
453   ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);
454 
455   a = (Mat_MPIAIJ*)A->data;
456   if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
457   if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
458   if (a->lvec) {
459     ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
460   }
461 
462   if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
463     a->spptr = new Mat_MPIAIJCUSPARSE;
464 
465     cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
466     cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
467     cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
468     cusparseStruct->coo_p               = NULL;
469     cusparseStruct->coo_pw              = NULL;
470     cusparseStruct->stream              = 0;
471     stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
472     cusparseStruct->deviceMat = NULL;
473   }
474 
475   A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
476   A->ops->mult           = MatMult_MPIAIJCUSPARSE;
477   A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
478   A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
479   A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
480   A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
481   A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;
482 
483   ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
484   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
485   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
486   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
487   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
488   PetscFunctionReturn(0);
489 }
490 
/* Constructor for MATMPIAIJCUSPARSE: verifies CUDA is usable, builds a plain
   MPIAIJ matrix, then converts it in place to the CUSPARSE variant. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
501 
502 /*@
503    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
505    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
506    assembly performance the user should preallocate the matrix storage by setting
507    the parameter nz (or the array nnz).  By setting these parameters accurately,
508    performance during matrix assembly can be increased by more than a factor of 50.
509 
510    Collective
511 
512    Input Parameters:
513 +  comm - MPI communicator, set to PETSC_COMM_SELF
514 .  m - number of rows
515 .  n - number of columns
516 .  nz - number of nonzeros per row (same for all rows)
517 -  nnz - array containing the number of nonzeros in the various rows
518          (possibly different for each row) or NULL
519 
520    Output Parameter:
521 .  A - the matrix
522 
523    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
524    MatXXXXSetPreallocation() paradigm instead of this routine directly.
525    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
526 
527    Notes:
528    If nnz is given then nz is ignored
529 
530    The AIJ format (also called the Yale sparse matrix format or
531    compressed row storage), is fully compatible with standard Fortran 77
532    storage.  That is, the stored row and column indices can begin at
533    either one (as in Fortran) or zero.  See the users' manual for details.
534 
535    Specify the preallocated storage with either nz or nnz (not both).
536    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
537    allocation.  For large problems you MUST preallocate memory or you
538    will get TERRIBLE performance, see the users' manual chapter on matrices.
539 
540    By default, this format uses inodes (identical nodes) when possible, to
541    improve numerical efficiency of matrix-vector products and solves. We
542    search for consecutive rows with the same nonzero structure, thereby
543    reusing matrix information to achieve increased efficiency.
544 
545    Level: intermediate
546 
547 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
548 @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size == 1) {
    /* single-process communicator: sequential CUSPARSE matrix */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    /* parallel communicator: MPI CUSPARSE matrix with diag/offdiag preallocation */
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
567 
568 /*MC
569    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
570 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
572    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
573    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
574 
575    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
576    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
577    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
578    for communicators controlling multiple processes.  It is recommended that you call both of
579    the above preallocation routines for simplicity.
580 
581    Options Database Keys:
582 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
583 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
584 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
585 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
586 
587   Level: beginner
588 
589  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
590 M
591 M*/
592 
593 // get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
594 PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
595 {
596 #if defined(PETSC_USE_CTABLE)
597   SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
598 #else
599   PetscSplitCSRDataStructure **p_d_mat;
600   PetscMPIInt                size,rank;
601   MPI_Comm                   comm;
602   PetscErrorCode             ierr;
603   int                        *ai,*bi,*aj,*bj;
604   PetscScalar                *aa,*ba;
605 
606   PetscFunctionBegin;
607   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
608   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
609   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
610   if (A->factortype == MAT_FACTOR_NONE) {
611     CsrMatrix *matrixA,*matrixB=NULL;
612     if (size == 1) {
613       Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
614       p_d_mat = &cusparsestruct->deviceMat;
615       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
616       if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
617 	matrixA = (CsrMatrix*)matstruct->mat;
618 	bi = bj = NULL; ba = NULL;
619       } else {
620 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
621       }
622     } else {
623       Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
624       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
625       p_d_mat = &spptr->deviceMat;
626       Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
627       Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
628       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
629       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
630       if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
631 	if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
632 	matrixA = (CsrMatrix*)matstructA->mat;
633 	matrixB = (CsrMatrix*)matstructB->mat;
634 	bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
635 	bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
636 	ba = thrust::raw_pointer_cast(matrixB->values->data());
637 	if (rank==-1) {
638 	  for(unsigned int i = 0; i < matrixB->row_offsets->size(); i++)
639 	    std::cout << "\trow_offsets[" << i << "] = " << (*matrixB->row_offsets)[i] << std::endl;
640 	  for(unsigned int i = 0; i < matrixB->column_indices->size(); i++)
641 	    std::cout << "\tcolumn_indices[" << i << "] = " << (*matrixB->column_indices)[i] << std::endl;
642 	  for(unsigned int i = 0; i < matrixB->values->size(); i++)
643 	    std::cout << "\tvalues[" << i << "] = " << (*matrixB->values)[i] << std::endl;
644 	}
645       } else {
646 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
647       }
648     }
649     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
650     aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
651     aa = thrust::raw_pointer_cast(matrixA->values->data());
652   } else {
653     *B = NULL;
654     PetscFunctionReturn(0);
655   }
656   // act like MatSetValues because not called on host
657   if (A->assembled) {
658     if (A->was_assembled) {
659       ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
660     }
661     A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
662   } else {
663     SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
664   }
665   if (!*p_d_mat) {
666     cudaError_t                 err;
667     PetscSplitCSRDataStructure  *d_mat, h_mat;
668     Mat_SeqAIJ                  *jaca;
669     PetscInt                    n = A->rmap->n, nnz;
670     // create and copy
671     ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
672     err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
673     err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
674     *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
675     if (size == 1) {
676       jaca = (Mat_SeqAIJ*)A->data;
677       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
678       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
679       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
680       h_mat.offdiag.a = NULL;
681     } else {
682       Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
683       Mat_SeqAIJ  *jacb;
684       jaca = (Mat_SeqAIJ*)aij->A->data;
685       jacb = (Mat_SeqAIJ*)aij->B->data;
686       if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
687       if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
688       // create colmap - this is usually done (lazy) in MatSetValues
689       aij->donotstash = PETSC_TRUE;
690       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
691       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly
692       ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
693       aij->colmap[A->cmap->N] = -9;
694       ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
695       {
696 	PetscInt ii;
697 	for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
698       }
699       if(aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
700       // allocate B copy data
701       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
702       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
703       nnz = jacb->i[n];
704 
705       if (jacb->compressedrow.use) {
706 	err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
707 	err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
708       } else {
709 	h_mat.offdiag.i = bi;
710       }
711       h_mat.offdiag.j = bj;
712       h_mat.offdiag.a = ba;
713 
714       err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
715       err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
716       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
717       h_mat.offdiag.n = n;
718     }
719     // allocate A copy data
720     nnz = jaca->i[n];
721     h_mat.diag.n = n;
722     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
723     ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr);
724     if (jaca->compressedrow.use) {
725       err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
726       err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
727     } else {
728       h_mat.diag.i = ai;
729     }
730     h_mat.diag.j = aj;
731     h_mat.diag.a = aa;
732     // copy pointers and metadata to device
733     err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
734     ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
735   } else {
736     *B = *p_d_mat;
737   }
738   A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
739   PetscFunctionReturn(0);
740 #endif
741 }
742