#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <petscsf.h>

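/* Thrust functor used below with zip iterators: copies the first tuple element into the
   second, i.e. a device-side permuted assignment of values */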
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

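/* Insert the numerical values v, given in the user's original COO ordering. If a
   permutation (coo_p) was built during preallocation, the values are first gathered on the
   device into coo_pw, ordered as [diagonal block | off-diagonal block], before being handed
   to the sequential COO kernels of a->A and a->B */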
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

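/* Predicates and transforms used by MatSetPreallocationCOO below: IsNotOffDiagT and
   IsOffDiag test whether a global column index falls inside the local diagonal block
   [cstart,cend), and GlobToLoc shifts a global index to a local one */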
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

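/* Build the parallel COO preallocation: copy the global (i,j) indices to the device;
   if diagonal and off-diagonal entries are interleaved, partition them into
   [diagonal block | off-diagonal block], recording the permutation in coo_p; compact the
   off-diagonal column indices into garray via sort+unique; convert indices to local
   numbering; and create the sequential diagonal (b->A) and off-diagonal (b->B) matrices */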
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n);
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* convert indices from global to local numbering */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy the off-diagonal column indices to the CPU so they can be mapped to local numbering */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected IS size %D != %D COO size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory; the CUSPARSE-specific preallocation call handles allocation internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

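/* Merge the diagonal and off-diagonal blocks into a single sequential matrix; if requested,
   also return the global column indices of the merged matrix (owned columns first, then the
   ghost columns from cmap) */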
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

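/* Standard (non-COO) preallocation: (re)create the sequential diagonal and off-diagonal
   blocks as SEQAIJCUSPARSE matrices and pass the preallocation information through */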
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because b->B may have been resized, we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let b->A and b->B use B's handle with the pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/*@
   MatAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Not collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE or MPIAIJCUSPARSE
-  gen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE
@*/
PetscErrorCode  MatAIJCUSPARSESetGenerateTranspose(Mat A, PetscBool gen)
{
  PetscErrorCode ierr;
  PetscBool      ismpiaij;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  MatCheckPreallocated(A,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATMPIAIJ,&ismpiaij);CHKERRQ(ierr);
  if (ismpiaij) {
    Mat A_d,A_o;

    ierr = MatMPIAIJGetSeqAIJ(A,&A_d,&A_o,NULL);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A_d,gen);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A_o,gen);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,gen);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

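/* Example (a minimal sketch; A, x, y stand for an assembled AIJCUSPARSE matrix and
   conforming vectors): generate the transpose once so repeated transpose products do not
   rebuild it

     ierr = MatAIJCUSPARSESetGenerateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
     ierr = MatMultTranspose(A,x,y);CHKERRQ(ierr);
*/

/* y = A*x: start the scatter of ghost values, overlap it with the diagonal-block product,
   then add the off-diagonal contribution */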
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

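/* Zero all matrix entries; if a device-side split-CSR mirror (deviceMat) exists, its value
   arrays are zeroed directly with cudaMemset() before the host-side blocks are zeroed */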
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t  err;
      PetscScalar  *vals;
      ierr = PetscInfo(A,"Zero device matrix diagonal and off-diagonal\n");CHKERRQ(ierr);
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

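/* z = y + A*x, with the same communication/computation overlap as MatMult() above */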
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

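/* y = A^T*x: apply the local transpose products first, then scatter-add the off-process
   contributions in reverse */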
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

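/* The storage formats can also be selected at runtime, e.g. (a sketch; the option names
   come from the handlers registered below):

     -mat_cusparse_storage_format ell
     -mat_cusparse_mult_diag_storage_format csr
     -mat_cusparse_mult_offdiag_storage_format hyb
*/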
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

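/* After the generic MPIAIJ assembly, switch the ghost-value work vector to a CUDA vector
   (optionally NVSHMEM-backed), and mark the matrix as offloaded when a device-side
   split-CSR mirror was assembled directly on the GPU */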
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
   #if defined(PETSC_HAVE_NVSHMEM)
    {
      PetscMPIInt result;
      PetscBool   useNvshmem = PETSC_FALSE;
      ierr = PetscOptionsGetBool(NULL,NULL,"-use_nvshmem",&useNvshmem,NULL);CHKERRQ(ierr);
      if (useNvshmem) {
        ierr = MPI_Comm_compare(PETSC_COMM_WORLD,PetscObjectComm((PetscObject)A),&result);CHKERRMPI(ierr);
        if (result == MPI_IDENT || result == MPI_CONGRUENT) {ierr = VecAllocateNVSHMEM_SeqCUDA(mpiaij->lvec);CHKERRQ(ierr);}
      }
    }
   #endif
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }
  PetscFunctionReturn(0);
}

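/* Tear down the GPU state: free the device-side split-CSR mirror (if any), destroy the
   cuSPARSE handle and the COO scratch arrays, then fall through to the host MatDestroy */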
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

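/* Convert an MPIAIJ matrix (in place or by duplication) to MPIAIJCUSPARSE: switch the
   blocks and work vector to their CUDA types, create the cuSPARSE handle on the default
   PETSc stream, and install the GPU method table */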
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0; /* We should not need cusparseStruct->stream */
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    stat = cusparseSetStream(cusparseStruct->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the DIAGONAL portion of the local submatrix (same value is used for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the DIAGONAL portion of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of the local submatrix (same value is used for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the OFF-DIAGONAL portion of the local submatrix (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL (and likewise for the off-diagonal terms) for
   PETSc to control dynamic memory allocation.  For large problems you MUST preallocate
   memory or you will get TERRIBLE performance; see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

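/* Example (a minimal sketch; the global size and nonzero estimates are made up for
   illustration):

     Mat A;
     ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,&A);CHKERRQ(ierr);
     // ... MatSetValues(), MatAssemblyBegin()/MatAssemblyEnd(); MatMult() then runs on the GPU ...
     ierr = MatDestroy(&A);CHKERRQ(ierr);
*/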
/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Get a GPU pointer to the stripped-down device matrix (split CSR form), for both Seq and MPI Mat */
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // mimic what MatSetValues does, since it is not called on the host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assembled more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done lazily in MatAssembly, but we no longer call it - done in AIJ AssemblyEnd; is it needed here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Matrix must be assembled");
  }
  if (!*p_d_mat) {
    cudaError_t                 err;
    PetscSplitCSRDataStructure  *d_mat, h_mat;
    Mat_SeqAIJ                  *jaca;
    PetscInt                    n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ  *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done lazily in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9;
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with MatSetValues - this is done lazily in the normal MatSetValues
  PetscFunctionReturn(0);
#endif
}
826