xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 4e84afc0f1a4047b7fb785e275ab9ac0e73c2b80)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_CXX_COMPLEX_FIX
3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
4 
5 #include <petscconf.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
7 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
8 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
9 #include <thrust/advance.h>
10 #include <petscsf.h>
11 
/* Functor for thrust::for_each over a zip iterator: copies tuple element 0
   into tuple element 1 (dst = src for each zipped pair). */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) { thrust::get<1>(t) = thrust::get<0>(t); }
};
21 
/* Set values of an MPIAIJCUSPARSE matrix from a COO value array, using the
   layout established by MatSetPreallocationCOO_MPIAIJCUSPARSE().
   v     - one value per preallocated COO entry; host or device memory
           (detected with isCudaMem()), or NULL
   imode - INSERT_VALUES or ADD_VALUES, forwarded to the sequential blocks */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total entries: diagonal block + off-diagonal block */
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (cusp->coo_p && v) {
    /* coo_p is the permutation computed at preallocation time that moves all
       diagonal-block entries in front of the off-diagonal ones; apply it to v,
       writing the permuted values into the scratch array coo_pw */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v); /* values already on the device */
    } else {
      /* host values: stage a copy on the device first */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* zip (v permuted by coo_p, coo_pw) so VecCUDAEquals does coo_pw[k] = v[coo_p[k]] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    /* first coo_nd permuted values feed the diagonal block, the remainder the off-diagonal block */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    /* no permutation was needed: v is already split as [diag | offdiag] */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU; /* up-to-date values now live on the GPU */
  PetscFunctionReturn(0);
}
68 
/* Tuple predicate: true when the column index (tuple element 1) lies inside
   the local diagonal-block column range [_cstart,_cend). */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return _cstart <= thrust::get<1>(t) && thrust::get<1>(t) < _cend;
  }
};
81 
/* Predicate: true when global column index c falls outside the local
   diagonal-block column ownership range [_cstart,_cend). */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return !(_cstart <= c && c < _cend);
  }
};
93 
/* Transform: shift a global index down by the local ownership start,
   yielding a local (zero-based) index. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c) { return c - _start; }
};
105 
/* Build the COO assembly data for an MPIAIJCUSPARSE matrix from global (i,j)
   index arrays of length n. Splits entries into the diagonal block (columns in
   [cmap->rstart,cmap->rend)) and the off-diagonal block, records a permutation
   (coo_p) when the two kinds are interleaved, translates indices from global
   to local, builds garray (the off-diagonal column map), and preallocates the
   two sequential CUSPARSE blocks. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;   /* number of distinct off-diagonal global columns */
  THRUSTINTARRAY         d_i(n);     /* device copies of the COO row/column indices */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* discard any previous block matrices and their supporting data */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* locate the first off-diagonal entry, and the first diagonal entry after it;
     if both exist the two kinds are interleaved and need to be partitioned */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* partition (i,j,perm) so all diagonal-block entries precede the off-diagonal
       ones; coo_p records the permutation so the values passed to
       MatSetValuesCOO can be reordered the same way */
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # off-diagonal entries */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* sort+unique the off-diagonal tail of d_j to obtain the distinct global columns */
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  /* garray maps local off-diagonal column index -> global column index */
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* translate the saved global off-diagonal columns (jj) to local indices via a
     hash mapping; note this reuses/overwrites the parameter n with the count found */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* (re)create the sequential diagonal (b->A) and off-diagonal (b->B) blocks */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  /* propagate GPU configuration to both blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
209 
/* Merge the diagonal and off-diagonal blocks of an MPIAIJCUSPARSE matrix into
   one local sequential matrix; optionally also return the global column
   indices (owned range followed by the garray-mapped ghost columns). */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cstart,dn,on,k,*gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cstart,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    /* diagonal-block columns are the contiguous owned range starting at cstart */
    for (k=0; k<dn; k++) gidx[k] = cstart + k;
    /* off-diagonal block columns come from the ghost-column map */
    for (k=0; k<on; k++) gidx[dn+k] = cmap[k];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
232 
/* MatMPIAIJSetPreallocation implementation for MATMPIAIJCUSPARSE.
   d_nz,d_nnz - nonzeros per row of the diagonal block (uniform value, or per-row array or NULL)
   o_nz,o_nnz - nonzeros per row of the off-diagonal block
   Destroys any previous off-diagonal block, (re)creates the sequential blocks
   as MATSEQAIJCUSPARSE, preallocates them, and propagates the CUSPARSE
   format/handle/stream configuration. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* debug-only validation of the user-provided per-row nonzero counts */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    /* MPI calls are checked with CHKERRMPI for consistency with the rest of the file */
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    /* uniprocessor runs have no off-diagonal columns */
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* propagate GPU configuration to both blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
291 
292 /*@
293    MatAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
294 
295    Not collective
296 
297    Input Parameters:
298 +  A - Matrix of type SEQAIJCUSPARSE or MPIAIJCUSPARSE
299 -  gen - the boolean flag
300 
301    Level: intermediate
302 
303 .seealso: MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE
304 @*/
PetscErrorCode  MatAIJCUSPARSESetGenerateTranspose(Mat A, PetscBool gen)
{
  PetscErrorCode ierr;
  PetscBool      ismpiaij;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  MatCheckPreallocated(A,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATMPIAIJ,&ismpiaij);CHKERRQ(ierr);
  if (!ismpiaij) {
    /* sequential matrix: apply directly */
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,gen);CHKERRQ(ierr);
  } else {
    /* parallel matrix: forward to the diagonal and off-diagonal blocks */
    Mat Adiag,Aoff;

    ierr = MatMPIAIJGetSeqAIJ(A,&Adiag,&Aoff,NULL);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(Adiag,gen);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(Aoff,gen);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
325 
/* y = A*x: the halo exchange of x is started before the diagonal-block
   product so communication overlaps computation, then the off-diagonal
   contribution is accumulated from the gathered ghost values. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mat = (Mat_MPIAIJ*)A->data;
  PetscInt       xlen;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  ierr = VecScatterBegin(mat->Mvctx,xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mat->A->ops->mult)(mat->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(mat->Mvctx,xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mat->B->ops->multadd)(mat->B,mat->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
341 
/* Zero all entries of an MPIAIJCUSPARSE matrix. If a device-side split-CSR
   view exists (spptr->deviceMat), its diagonal and off-diagonal value arrays
   are zeroed directly on the GPU with cudaMemset; the sequential blocks are
   always zeroed through MatZeroEntries(). */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n]; /* nnz counts from the CSR row offsets */
      cudaError_t  err;
      PetscScalar  *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr); /* message typo fixed */
      /* read the device-resident value-array pointers out of *d_mat, then zero them in place */
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
/* z = y + A*x: same overlap pattern as MatMult — the ghost-value scatter of x
   runs while the diagonal block computes, then the off-diagonal part is added. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *mat = (Mat_MPIAIJ*)A->data;
  PetscInt       xlen;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  ierr = VecScatterBegin(mat->Mvctx,xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mat->A->ops->multadd)(mat->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(mat->Mvctx,xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mat->B->ops->multadd)(mat->B,mat->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
384 
/* y = A^T*x: compute the off-diagonal transpose product into lvec and the
   local transpose product into yy, then scatter lvec back additively
   (reverse direction) to accumulate remote contributions. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mat = (Mat_MPIAIJ*)A->data;
  PetscInt       xlen;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,xlen);
  ierr = (*mat->B->ops->multtranspose)(mat->B,xx,mat->lvec);CHKERRQ(ierr);
  ierr = (*mat->A->ops->multtranspose)(mat->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(mat->Mvctx,mat->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(mat->Mvctx,mat->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
400 
/* Record the requested cusparse storage format for the diagonal block, the
   off-diagonal block, or both, depending on op. The format takes effect when
   it is propagated to the sequential blocks (preallocation/convert paths). */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* message fixed to list the operations actually handled above (previously
       named MAT_CUSPARSE_MULT_DIAG twice and a nonexistent MAT_CUSPARSE_MULT_ALL) */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
423 
/* Process runtime options for MPIAIJCUSPARSE matrices: the cusparse storage
   format of the diagonal block, the off-diagonal block, or both. Skipped for
   factored matrices (factortype != MAT_FACTOR_NONE). */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;      /* set to PETSC_TRUE when the option was supplied */
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* storage format of the diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    /* storage format of the off-diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    /* storage format of both blocks at once (overrides the two options above if given last) */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
454 
/* Finish assembly: defer to the generic MPIAIJ assembly, then make the ghost
   work vector a CUDA vector on first final assembly; if a device-side
   split-CSR matrix exists, assembly happened on the GPU, so mark it current. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *dmat   = cusp->deviceMat;
  PetscErrorCode             ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FINAL_ASSEMBLY && !A->was_assembled) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (dmat) A->offloadmask = PETSC_OFFLOAD_GPU; /* we assembled on the device */
  PetscFunctionReturn(0);
}
472 
/* Destroy an MPIAIJCUSPARSE matrix: free the device-side split-CSR metadata
   (if one was created), destroy the cusparse handle and optional stream,
   delete the COO scratch arrays, remove the composed methods, then defer to
   MatDestroy_MPIAIJ(). */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the device struct to the host so the device pointers embedded in it can be freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* the row arrays are freed only when the corresponding block uses compressed rows,
       mirroring how they were set up */
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* unregister the methods composed in MatConvert_MPIAIJ_MPIAIJCUSPARSE() */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
519 
/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE.
   reuse - MAT_INITIAL_MATRIX (duplicate B into *newmat), MAT_REUSE_MATRIX
           (copy B into the existing *newmat), or MAT_INPLACE_MATRIX
           (*newmat is B itself; nothing is copied).
   Switches the default vector type to CUDA, retypes the sequential blocks,
   allocates the Mat_MPIAIJCUSPARSE context when needed, and installs the
   CUSPARSE-specific operations and composed methods. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat; /* for MAT_INPLACE_MATRIX this is B itself */

  /* make CUDA vectors the default for this matrix */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  /* retype the sequential blocks and the ghost work vector, when present */
  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  /* create the GPU-specific context unless a previous conversion already did */
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;   /* default stream */
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  /* install the CUSPARSE-specific operations */
  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
576 
/* MATMPIAIJCUSPARSE constructor: build a plain MPIAIJ matrix, then convert it
   in place to the CUSPARSE flavor (vector type, ops table, composed methods). */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* ensure a CUDA device has been initialized before any GPU work */
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
587 
588 /*@
589    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the cuSPARSE library for calculations. For good matrix
592    assembly performance the user should preallocate the matrix storage by setting
593    the parameter nz (or the array nnz).  By setting these parameters accurately,
594    performance during matrix assembly can be increased by more than a factor of 50.
595 
596    Collective
597 
598    Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it computed from M)
.  n - number of local columns (or PETSC_DECIDE to have it computed from N)
.  M - number of global rows (or PETSC_DETERMINE to have it computed from m)
.  N - number of global columns (or PETSC_DETERMINE to have it computed from n)
.  d_nz - number of nonzeros per row in the diagonal block (same value for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same value for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal block
           (possibly different for each row) or NULL
605 
606    Output Parameter:
607 .  A - the matrix
608 
609    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
610    MatXXXXSetPreallocation() paradigm instead of this routine directly.
611    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
612 
613    Notes:
   If d_nnz (resp. o_nnz) is given then d_nz (resp. o_nz) is ignored
615 
616    The AIJ format (also called the Yale sparse matrix format or
617    compressed row storage), is fully compatible with standard Fortran 77
618    storage.  That is, the stored row and column indices can begin at
619    either one (as in Fortran) or zero.  See the users' manual for details.
620 
   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
623    allocation.  For large problems you MUST preallocate memory or you
624    will get TERRIBLE performance, see the users' manual chapter on matrices.
625 
626    By default, this format uses inodes (identical nodes) when possible, to
627    improve numerical efficiency of matrix-vector products and solves. We
628    search for consecutive rows with the same nonzero structure, thereby
629    reusing matrix information to achieve increased efficiency.
630 
631    Level: intermediate
632 
633 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
634 @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt    size;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size == 1) {
    /* uniprocessor communicator: a sequential CUSPARSE matrix suffices */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
653 
654 /*MC
655    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
656 
657    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
658    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
659    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
660 
661    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
662    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
663    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
664    for communicators controlling multiple processes.  It is recommended that you call both of
665    the above preallocation routines for simplicity.
666 
667    Options Database Keys:
668 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
669 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
670 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
671 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
672 
673   Level: beginner
674 
675  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
678 
/*
  MatCUSPARSEGetDeviceMatWrite - returns, via B, a device pointer to a stripped-down
  CSR description (PetscSplitCSRDataStructure) of A so that matrix entries can be
  written directly from device code. Works for both sequential (size == 1) and MPI
  AIJCUSPARSE matrices.

  Input Parameter:
.  A - the assembled, unfactored AIJCUSPARSE matrix

  Output Parameter:
.  B - device pointer to the split-CSR metadata; set to NULL when A is factored

  Notes:
  The device structure is built lazily on the first call and cached in the
  matrix's spptr (deviceMat field); subsequent calls return the cached pointer.
  On success A->assembled is reset to PETSC_FALSE so the matrix is ready for a
  new round of (device-side) MatSetValues followed by assembly.
  Errors if A is not assembled, or if any CUSPARSE block is not in CSR format.
  Not available when PETSc is configured with ctable (--with-ctable).
*/
// get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  /* colmap below is used as a dense PetscInt array; the ctable representation is unsupported */
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;    /* address of the cached device-matrix pointer inside A's spptr */
  PetscMPIInt                size,rank;    /* NOTE(review): rank appears unused below (h_mat.rank is filled by a separate MPI_Comm_rank call) — candidate for removal, confirm */
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;  /* raw device CSR row-offset/column-index pointers for diag (a*) and offdiag (b*) blocks */
  PetscScalar                *aa,*ba;          /* raw device CSR value pointers for diag/offdiag blocks */

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      /* sequential: only the "diagonal" block exists; no off-diagonal CSR data */
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      /* parallel: grab raw device pointers for both the diagonal (A) and
         off-diagonal (B) sequential blocks; both must be plain CSR */
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    /* factored matrices carry no writable split-CSR metadata */
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
  }
  if (!*p_d_mat) {
    /* first call: stage the metadata in h_mat on the host, then copy it to a
       freshly cudaMalloc'd d_mat on the device.
       NOTE(review): d_mat and the buffers cudaMalloc'd below are not freed in
       this routine — presumably released with the cusparse struct; confirm. */
    cudaError_t                 err;
    PetscSplitCSRDataStructure  *d_mat, h_mat;
    Mat_SeqAIJ                  *jaca;
    PetscInt                    n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      /* sequential: whole ownership range, no off-diagonal block */
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ  *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazy) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      /* dense global-column -> local-offdiag-column map (1-based; 0 means "not present");
         the extra sentinel slot at index N guards against garray entries out of range */
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9;
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n]; /* NOTE(review): overwritten by nnz = jaca->i[n] below; this value is never used — confirm intent */

      if (jacb->compressedrow.use) {
        /* device CSR row offsets are compressed; upload the full (n+1)-entry host
           row-offset array instead so device indexing matches the uncompressed rows */
        err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    if (jaca->compressedrow.use) {
      /* same uncompression trick as for the off-diagonal block above */
      err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    /* already built on a previous call: return the cached device pointer */
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}
814