xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision a0e72f99b781a1b11acf6c40ba4a7ecafcc3e699)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_CXX_COMPLEX_FIX
3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
4 
5 #include <petscconf.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
7 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
8 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
9 #include <thrust/advance.h>
10 #include <petscsf.h>
11 
/* Thrust functor for use on zipped (source,destination) iterator pairs:
   copies tuple slot 0 into tuple slot 1, i.e. a device-side gather/assign. */
struct VecCUDAEquals
{
  template <typename TupleType>
  __host__ __device__ void operator()(TupleType zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped);
  }
};
21 
/* Inserts/adds the COO values v into the diagonal (a->A) and off-diagonal (a->B)
   sequential CUSPARSE blocks on the GPU. v is ordered as the (coo_i,coo_j) arrays
   passed at MatSetPreallocationCOO time; it may reside in host or device memory
   (detected via isCudaMem) or be NULL. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total COO entries: diag block + offdiag block */
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    /* coo_p holds the permutation recorded at preallocation time (diag/offdiag entries
       were interleaved): gather v through coo_p into the work array coo_pw, then split */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* host values: stage them into a temporary device array first */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* zip (permuted source, destination) so one for_each performs the whole gather */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w; /* no-op when v was already device memory (w stays NULL) */
    /* coo_pw now holds values grouped as [diag entries | offdiag entries] */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    /* no permutation was needed: v is already grouped [diag | offdiag] (or is NULL) */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* mark the parallel matrix assembled, with the current data living on the GPU */
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
66 
/* Thrust predicate on zipped (row,col,perm) tuples: true when the column index
   (tuple slot 1) belongs to the local diagonal block, i.e. lies in [cstart,cend). */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return col >= _cstart && col < _cend;
  }
};
79 
/* Thrust predicate: true when a global column index falls outside the local
   diagonal-block ownership range [cstart,cend). */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &col)
  {
    return !(_cstart <= col && col < _cend);
  }
};
91 
/* Thrust transform functor: converts a global index to a local one by
   subtracting the local ownership start. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &g)
  {
    const PetscInt local = g - _start;
    return local;
  }
};
103 
/* Builds the sequential diagonal (b->A) and off-diagonal (b->B) SEQAIJCUSPARSE blocks from a
   global COO pattern (coo_i,coo_j): entries are partitioned on the device into diagonal/
   off-diagonal groups, indices are converted from global to local numbering, and the
   off-diagonal column map b->garray is constructed. Values are set later through
   MatSetValuesCOO_MPIAIJCUSPARSE, which replays the same partition via cusp->coo_p. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* device copies of the COO row and column indices */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* discard any previously existing blocks and communication data */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* locate the first off-diagonal entry and, after it, the next diagonal entry; when both
     exist the two groups are interleaved and an explicit partition is required */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* record in coo_p the permutation applied by the partition, so MatSetValuesCOO can
       reorder user-supplied values identically; coo_pw is its value work array */
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # entries in the diagonal block */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # entries in the off-diagonal block */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* sort+unique the off-diagonal global columns on the device to get the compact column set */
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit); /* number of distinct off-diagonal columns */
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* translate the off-diagonal COO columns in jj from global numbering to their
     positions within garray (the local numbering of b->B) */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* (re)create the two sequential blocks as CUSPARSE matrices */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  /* only the pattern exists so far; values come later via MatSetValuesCOO */
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
207 
/* Merges the diagonal and off-diagonal blocks into a single local SEQAIJCUSPARSE matrix;
   optionally returns (in glob) the global column indices of the merged matrix, which are
   the owned columns followed by the ghost columns from the off-diagonal map. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            diag,offd;
  const PetscInt *garray;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&diag,&offd,&garray);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(diag,offd,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cstart, k, nd, no, *indices;

    ierr = MatGetLocalSize(diag,NULL,&nd);CHKERRQ(ierr);
    ierr = MatGetLocalSize(offd,NULL,&no);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cstart,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(nd+no,&indices);CHKERRQ(ierr);
    /* owned columns first (shifted to global numbering), then the ghost columns */
    for (k=0; k<nd+no; k++) indices[k] = (k < nd) ? cstart + k : garray[k-nd];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)diag),nd+no,indices,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
230 
/* MatMPIAIJSetPreallocation implementation for the CUSPARSE subtype: (re)creates the
   sequential diagonal (b->A) and off-diagonal (b->B) blocks as SEQAIJCUSPARSE matrices,
   preallocates them from d_nz/d_nnz and o_nz/o_nnz, and propagates the stored GPU storage
   formats and the shared cusparse handle to both blocks. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* validate user-provided per-row nonzero counts in debug builds only */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  /* drop the stale off-process column map and communication data */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    /* on a single process there is no off-diagonal part: give it zero columns */
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
290 
291 /*@
292    MatAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
293 
294    Not collective
295 
296    Input Parameters:
297 +  A - Matrix of type SEQAIJCUSPARSE or MPIAIJCUSPARSE
298 -  gen - the boolean flag
299 
300    Level: intermediate
301 
302 .seealso: MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE
303 @*/
PetscErrorCode  MatAIJCUSPARSESetGenerateTranspose(Mat A, PetscBool gen)
{
  PetscErrorCode ierr;
  PetscBool      ismpiaij;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  MatCheckPreallocated(A,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATMPIAIJ,&ismpiaij);CHKERRQ(ierr);
  if (!ismpiaij) {
    /* sequential matrix: forward directly */
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,gen);CHKERRQ(ierr);
  } else {
    /* parallel matrix: apply the flag to both sequential blocks */
    Mat dmat,omat;

    ierr = MatMPIAIJGetSeqAIJ(A,&dmat,&omat,NULL);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(dmat,gen);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(omat,gen);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
324 
/* yy = A*xx for the parallel matrix: the scatter of ghost values (needed by the
   off-diagonal block) is overlapped with the local diagonal-block product. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  /* start communication of ghost entries; compute yy = diag(A)*xx while it is in flight */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* add the off-diagonal contribution once the ghost values (lvec) have arrived */
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
340 
/* Zeroes all stored values of the parallel matrix. When a device-side split-CSR mirror
   (spptr->deviceMat) exists, its diagonal and off-diagonal value arrays are zeroed
   directly with cudaMemset; the host-side blocks are always zeroed via MatZeroEntries. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n]; /* nnz counts from the CSR row pointers */
      cudaError_t  err;
      PetscScalar  *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr); /* fixed typo: was "offfdiag" */
      /* fetch the device pointer to each value array, then memset it on the device */
      err = cudaMemcpy(&vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset(vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy(&vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset(vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
/* zz = yy + A*xx for the parallel matrix; same communication/computation overlap
   pattern as MatMult_MPIAIJCUSPARSE. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  /* start the ghost-value scatter; compute zz = yy + diag(A)*xx meanwhile */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* accumulate the off-diagonal contribution once the ghost values have arrived */
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
383 
/* yy = A^T*xx: both local transpose products are computed first, then the off-diagonal
   contributions held in lvec are added back to their owning processes with a reverse scatter. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  /* lvec = B^T*xx holds values destined for other ranks */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
399 
/* Records the requested cusparse storage format for the diagonal block, the off-diagonal
   block, or both; the format is applied to the sequential blocks at preallocation time. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed message: it previously listed MAT_CUSPARSE_MULT_DIAG twice and a
       nonexistent MAT_CUSPARSE_MULT_ALL instead of the enums handled above */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
422 
/* Processes the -mat_cusparse_* options controlling the GPU storage format of the
   diagonal and off-diagonal blocks; only meaningful for unfactored matrices. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    /* the combined option sets both blocks at once */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
453 
/* Finishes assembly via the base MPIAIJ implementation, then switches the ghost work
   vector to CUDA on the first final assembly. NOTE(review): d_mat is read from spptr
   before the base assembly runs — presumably it cannot change during assembly. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    /* first final assembly: make the ghost vector a CUDA vector */
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}
471 
/* Destructor for MPIAIJCUSPARSE: frees the device-side split-CSR metadata (if present),
   the cusparse handle and COO scratch arrays, and the Mat_MPIAIJCUSPARSE container;
   then detaches the composed methods and falls through to the base MPIAIJ destructor. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the device struct to the host so the device pointers it embeds can be freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* the row-pointer arrays were separately allocated only in the compressed-row case */
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* remove the type-specific composed functions before the base destroy */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
520 
/* Converts an MPIAIJ matrix to the CUSPARSE subtype: switches the sub-matrices and ghost
   vector to their CUDA types, allocates and initializes the Mat_MPIAIJCUSPARSE container
   (unless reusing an existing one), installs the GPU Mat operations, and composes the
   type-specific methods. Handles MAT_INITIAL_MATRIX, MAT_REUSE_MATRIX and (implicitly,
   by falling through both branches) MAT_INPLACE_MATRIX. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* for MAT_INPLACE_MATRIX, *newmat is B itself */
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  /* switch existing sub-objects to their CUDA counterparts */
  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0; /* We should not need cusparseStruct->stream */
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    stat = cusparseSetStream(cusparseStruct->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  /* install the GPU implementations of the Mat operations */
  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
578 
/* Type constructor: builds a plain MPIAIJ matrix and converts it in place to MPIAIJCUSPARSE. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* lazy CUDA initialization */
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
589 
590 /*@
591    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
593    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
594    assembly performance the user should preallocate the matrix storage by setting
595    the parameter nz (or the array nnz).  By setting these parameters accurately,
596    performance during matrix assembly can be increased by more than a factor of 50.
597 
598    Collective
599 
600    Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
           (possibly different for each row) or NULL
607 
608    Output Parameter:
609 .  A - the matrix
610 
611    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
612    MatXXXXSetPreallocation() paradigm instead of this routine directly.
613    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
614 
615    Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
617 
618    The AIJ format (also called the Yale sparse matrix format or
619    compressed row storage), is fully compatible with standard Fortran 77
620    storage.  That is, the stored row and column indices can begin at
621    either one (as in Fortran) or zero.  See the users' manual for details.
622 
623    Specify the preallocated storage with either nz or nnz (not both).
624    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
625    allocation.  For large problems you MUST preallocate memory or you
626    will get TERRIBLE performance, see the users' manual chapter on matrices.
627 
628    By default, this format uses inodes (identical nodes) when possible, to
629    improve numerical efficiency of matrix-vector products and solves. We
630    search for consecutive rows with the same nonzero structure, thereby
631    reusing matrix information to achieve increased efficiency.
632 
633    Level: intermediate
634 
635 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
636 @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    nproc;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  /* sequential and parallel communicators get different (but compatible) types */
  ierr = MPI_Comm_size(comm,&nproc);CHKERRMPI(ierr);
  if (nproc == 1) {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
655 
656 /*MC
657    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
658 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
660    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
661    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
662 
663    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
664    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
665    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
666    for communicators controlling multiple processes.  It is recommended that you call both of
667    the above preallocation routines for simplicity.
668 
669    Options Database Keys:
670 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
671 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
672 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
673 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
674 
675   Level: beginner
676 
677  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
680 
/* MatCUSPARSEGetDeviceMatWrite - build (once, lazily) a device-resident PetscSplitCSRDataStructure
   describing A's CSR data so that matrix entries can be written from device code, and return a
   device pointer to it in B.  Works for both sequential and MPI AIJCUSPARSE matrices; for MPI both
   the diagonal (A) and off-diagonal (B) blocks are exposed, together with a device copy of colmap.
   For factored matrices *B is set to NULL.  On success A is marked unassembled, i.e. the caller is
   expected to write values and then trigger a new assembly. */
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  /* colmap is copied to the GPU as a flat PetscInt array below; the ctable representation has no
     device equivalent, so this feature requires --with-ctable=0 */
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat; /* address of the cached device-matrix pointer inside A's GPU struct */
  PetscMPIInt                size,rank; /* NOTE(review): rank is fetched but never read afterwards; h_mat.rank is obtained by a second MPI_Comm_rank call below */
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj; /* raw device pointers to the CSR row offsets / column indices of the diag (a*) and offdiag (b*) blocks */
  PetscScalar                *aa,*ba;         /* raw device pointers to the CSR values of the diag / offdiag blocks */

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      /* sequential: the whole matrix is the "diagonal" block; there is no offdiag part */
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      /* parallel: grab device CSR data for both the diagonal (aij->A) and off-diagonal (aij->B) blocks */
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    /* factored matrices are not supported: hand back NULL instead of erroring */
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazily) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
  }
  if (!*p_d_mat) {
    /* first call on this matrix: allocate the device struct, fill a host mirror, and copy it over */
    cudaError_t                 err;
    PetscSplitCSRDataStructure  *d_mat, h_mat; /* d_mat lives on the device; h_mat is the host staging copy */
    Mat_SeqAIJ                  *jaca;
    PetscInt                    n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      /* sequential: ownership range is the whole matrix, no offdiag block */
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ  *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazily) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      /* sentinel value used to verify garray stays within [0, A->cmap->N) in the loop below */
      aij->colmap[A->cmap->N] = -9;
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        /* colmap maps a global column index to (local offdiag column + 1); 0 means "not present" */
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      /* NOTE(review): this nnz (offdiag) is unconditionally overwritten by the diag nnz below and
         only the diag value is reported in the PetscInfo2 at the end - confirm that is intended */
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        /* NOTE(review): copies n+1 entries of jacb->i even though compressed-row storage is in
           use - assumes jacb->i still holds full (uncompressed) row offsets here; verify */
        err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi; /* reuse the cuSPARSE device row offsets directly */
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    if (jaca->compressedrow.use) {
      /* NOTE(review): same full-length row-offset copy assumption as the offdiag block above */
      err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    /* already built on a previous call: just return the cached device pointer */
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazily) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}
816