xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 0f1503a698da0451f8f2e3e02a165d4e1f526457)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
3 
4 #include <petscconf.h>
5 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
6 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
7 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
8 #include <thrust/advance.h>
9 #include <thrust/partition.h>
10 #include <thrust/sort.h>
11 #include <thrust/unique.h>
12 #include <petscsf.h>
13 
/* Functor for thrust::for_each over a zip iterator: assigns the first tuple
   component to the second, i.e. copies a source range into a destination
   range element-by-element on host or device. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
23 
/* Set the numerical values of A for the COO nonzero pattern previously given to
   MatSetPreallocationCOO_MPIAIJCUSPARSE(). The values v[] are in the original
   user COO ordering; if preallocation had to reorder the entries (cusp->coo_p
   holds that permutation), v is first gathered through coo_p into the device
   work array cusp->coo_pw. The first cusp->coo_nd values belong to the diagonal
   block a->A, the remaining cusp->coo_no to the off-diagonal block a->B.
   v may point to host or device memory; v == NULL is forwarded unchanged. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total COO entries on this rank */
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v); /* v already resides on the device */
    } else {
      /* stage host values in a temporary device array */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* zip (permuted source, destination) so for_each performs coo_pw[k] = v[coo_p[k]] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    /* diagonal-block values come first in coo_pw, off-diagonal values at offset coo_nd */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    /* no permutation needed: v is already split as [diag | offdiag] (or NULL) */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  /* mark the matrix assembled, with the up-to-date copy residing on the GPU */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
66 
/* Predicate on a (row,col,perm) tuple: true when the column (second tuple
   component) lies inside the local diagonal-block range [_cstart,_cend). */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return _cstart <= col && col < _cend;
  }
};
79 
/* Predicate on a column index: true when it falls outside the local
   diagonal-block range [_cstart,_cend), i.e. the entry belongs to the
   off-diagonal part of the matrix. */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return !(_cstart <= c && c < _cend);
  }
};
91 
/* Unary functor mapping a global index to a local index by subtracting the
   first index owned by this process. */
struct GlobToLoc
{
  PetscInt _start; /* first globally-numbered index owned locally */

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};
103 
/* Define the nonzero pattern of B from n COO entries (coo_i[],coo_j[]) with
   global indices. On the device the entries are partitioned into diagonal-block
   entries (column inside [cmap->rstart,cmap->rend)) followed by off-diagonal
   entries; when a reordering is necessary the permutation is stored in
   cusp->coo_p (with value work array cusp->coo_pw) so that
   MatSetValuesCOO_MPIAIJCUSPARSE() can later gather user-ordered values.
   Indices are then translated to local numbering, the off-diagonal (ghost)
   column map b->garray is compacted, and the sequential blocks b->A and b->B
   are (re)created as MATSEQAIJCUSPARSE and preallocated on the GPU. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0; /* number of distinct off-diagonal (ghost) columns */
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* discard any previous blocks and communication data; everything is rebuilt below */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  /* upload the user's COO indices to the device */
  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* find the first off-diagonal entry, and the first diagonal entry after it;
     if both exist the input is not already ordered as [diag | offdiag] */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* reorder: partition the (i,j,perm) tuples so diagonal entries come first,
       remembering the permutation so values can be matched up later */
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # entries in the diagonal block */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # entries in the off-diagonal block */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  /* b->garray maps compacted local ghost columns back to their global indices */
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* translate the global off-diagonal column ids in jj[] to compacted local ids
     via a hash-based local-to-global mapping; n is reused as the output count */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* (re)create the sequential diagonal block (B->rmap->n x B->cmap->n) */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  /* ...and the off-diagonal block with noff compacted ghost columns */
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  /* propagate the stored GPU storage formats and the cusparse handle to both blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  /* only the pattern is set here; values must still be supplied and assembled */
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
205 
/* Merge the diagonal and off-diagonal blocks of A into one sequential matrix
   A_loc. When glob is non-NULL, also return the global column indices that the
   columns of A_loc correspond to: the locally owned columns first (contiguous
   from the column ownership start), then the ghost columns from the column map. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cstart, k, ndiag, noffd, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&ndiag);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&noffd);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cstart,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(ndiag+noffd,&gidx);CHKERRQ(ierr);
    for (k=0; k<ndiag; k++) gidx[k]       = cstart + k; /* owned columns are contiguous */
    for (k=0; k<noffd; k++) gidx[ndiag+k] = cmap[k];    /* ghost columns from the column map */
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),ndiag+noffd,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
228 
/* Classic (d_nz/d_nnz, o_nz/o_nnz) preallocation for the parallel CUSPARSE AIJ
   matrix: validates the per-row counts in debug builds, discards stale column
   maps and communication data, (re)creates the sequential diagonal block b->A
   and off-diagonal block b->B as MATSEQAIJCUSPARSE, and forwards the
   preallocation together with the stored GPU storage formats and cusparse
   handle to them. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* negative per-row counts are user errors; only verified in debug builds */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
  /* drop the stale global-to-local column map and communication objects */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    /* on a single process there is no off-diagonal part: use zero columns */
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
288 
/* yy = A*xx. The scatter of needed ghost values into a->lvec is started first
   and overlapped with the multiply by the diagonal block; the off-diagonal
   contribution is added after the scatter completes. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt; /* local size of xx, checked against A's column layout */

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU. */
  if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);}
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); /* start ghost-value communication */
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); /* diagonal block while messages are in flight */
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); /* add off-diagonal contribution */
  PetscFunctionReturn(0);
}
306 
/* Zero all entries of the parallel matrix by zeroing both of its sequential
   blocks (diagonal part A and off-diagonal part B). */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(mpiaij->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(mpiaij->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
317 
/* zz = yy + A*xx. The scatter of ghost values into a->lvec is overlapped with
   the multiply-add of the diagonal block; the off-diagonal contribution is
   added after the scatter completes. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt; /* local size of xx, checked against A's column layout */

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  /* If A is bound to the CPU, the local vector used in the matrix multiplies
     should also be bound to the CPU (same policy as MatMult_MPIAIJCUSPARSE,
     which previously applied this guard while this routine did not). */
  if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);}
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); /* start ghost-value communication */
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr); /* diagonal block while messages are in flight */
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr); /* add off-diagonal contribution */
  PetscFunctionReturn(0);
}
333 
/* yy = A^T*xx. The off-diagonal transpose product is computed into the local
   work vector a->lvec first, then the diagonal transpose product fills yy,
   and finally the lvec contributions are scattered back additively (reverse
   scatter) into yy on their owning processes. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt; /* local size of xx, checked against A's row layout */

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); /* off-diagonal contributions into lvec */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);      /* diagonal-block transpose product */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
349 
/* Record the requested cuSPARSE storage format for the diagonal block, the
   off-diagonal block, or both; the formats are applied to the sequential
   blocks when they are (re)created. Errors on unsupported operations.
   Fix: the error message previously listed MAT_CUSPARSE_MULT_DIAG twice and
   named a non-existent MAT_CUSPARSE_MULT_ALL instead of MAT_CUSPARSE_ALL. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
372 
/* Process the -mat_cusparse_* options controlling the GPU storage format of
   the diagonal and/or off-diagonal blocks. The options are only consulted for
   unfactored matrices (A->factortype == MAT_FACTOR_NONE). */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg; /* set when the corresponding option was given */
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    /* off-diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    /* both blocks at once */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
403 
/* Finish assembly via the standard MPIAIJ path, make sure the local work
   vector is a CUDA vector, and, if the nonzero structure changed during
   assembly, free the stripped-down device-side copy of the matrix (it
   describes the old structure and would be stale). */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz = A->nonzerostate; /* structure state before assembly */

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    /* d_mat lives on the device; stage a host copy so the device pointers it
       contains can be freed individually */
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) { /* index arrays are owned by the device struct */
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) { /* off-diagonal part may be absent */
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
436 
/* Destroy the CUSPARSE-specific state of A (device-side split-CSR copy,
   cusparse handles, COO permutation arrays, the Mat_MPIAIJCUSPARSE context),
   remove the composed methods, and fall through to the base MPIAIJ destroy. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    /* d_mat lives on the device; stage a host copy so the device pointers it
       contains can be freed individually */
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) { /* index arrays are owned by the device struct */
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) { /* off-diagonal part may be absent */
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  /* thrust container destruction may throw; translate into a PETSc error */
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* detach the type-specific composed functions before the base destroy */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
489 
/* Convert an MPIAIJ matrix to MATMPIAIJCUSPARSE. Depending on reuse this works
   on a duplicate of B (MAT_INITIAL_MATRIX), copies into an existing *newmat
   (MAT_REUSE_MATRIX), or converts B itself in place (MAT_INPLACE_MATRIX, where
   *newmat == B). The sub-matrices and local work vector are retyped to their
   CUDA variants, the Mat_MPIAIJCUSPARSE context is allocated on first
   conversion, and the CUSPARSE-specific methods and composed functions are
   installed. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A; /* the matrix actually being converted */

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  /* vectors created for this matrix default to CUDA */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  /* retype existing sub-objects (they may not exist before preallocation) */
  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  /* allocate and initialize the CUSPARSE context once */
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    cusparseStruct->deviceMat           = NULL;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
549 
/* Registered constructor for MATMPIAIJCUSPARSE: create a plain MPIAIJ matrix
   and convert it in place to the CUSPARSE implementation. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* ensure the CUDA device is initialized first */
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
560 
561 /*@
562    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
563    (the default parallel PETSc format).  This matrix will ultimately be pushed down
564    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
565    assembly performance the user should preallocate the matrix storage by setting
566    the parameter nz (or the array nnz).  By setting these parameters accurately,
567    performance during matrix assembly can be increased by more than a factor of 50.
568 
569    Collective
570 
571    Input Parameters:
572 +  comm - MPI communicator
573 .  m,n - number of local rows and columns (or PETSC_DECIDE)
574 .  M,N - number of global rows and columns (or PETSC_DETERMINE)
575 .  d_nz,o_nz - number of nonzeros per row in the diagonal and off-diagonal blocks (same for all local rows)
576 -  d_nnz,o_nnz - arrays containing the number of nonzeros in the rows of the diagonal and off-diagonal
577          blocks (possibly different for each row) or NULL
578 
579    Output Parameter:
580 .  A - the matrix
581 
582    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
583    MatXXXXSetPreallocation() paradigm instead of this routine directly.
584    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
585 
586    Notes:
587    If nnz is given then nz is ignored
588 
589    The AIJ format (also called the Yale sparse matrix format or
590    compressed row storage), is fully compatible with standard Fortran 77
591    storage.  That is, the stored row and column indices can begin at
592    either one (as in Fortran) or zero.  See the users' manual for details.
593 
594    Specify the preallocated storage with either nz or nnz (not both).
595    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
596    allocation.  For large problems you MUST preallocate memory or you
597    will get TERRIBLE performance, see the users' manual chapter on matrices.
598 
599    By default, this format uses inodes (identical nodes) when possible, to
600    improve numerical efficiency of matrix-vector products and solves. We
601    search for consecutive rows with the same nonzero structure, thereby
602    reusing matrix information to achieve increased efficiency.
603 
604    Level: intermediate
605 
606 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
607 @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size == 1) {
    /* single-process communicator: use the sequential CUSPARSE type */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    /* parallel CUSPARSE type with diagonal/off-diagonal block preallocation */
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
626 
627 /*MC
628    MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.
629 
630    A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
631    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
632    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
633 
634    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
635    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
636    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
637    for communicators controlling multiple processes.  It is recommended that you call both of
638    the above preallocation routines for simplicity.
639 
640    Options Database Keys:
641 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
642 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
643 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
644 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
645 
646   Level: beginner
647 
648  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
649 M*/
650 
651 /*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
653 
654   Level: beginner
655 
656  .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
657 M*/
658 
659 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
660 
661 // get GPU pointers to stripped down Mat. For both seq and MPI Mat.
/*
   MatCUSPARSEGetDeviceMatWrite - returns (creating on first use) a device-resident
   PetscSplitCSRDataStructure holding raw CSR pointers (row offsets, column indices,
   values) for the diagonal block and, in parallel, the off-diagonal block of A,
   for use in device-side matrix assembly.

   Input Parameter:
.  A - the matrix; must be assembled, unfactored, and of type (SEQ/MPI)AIJCUSPARSE

   Output Parameter:
.  B - device pointer to the split-CSR structure (NULL if A is factored)

   Notes:
   The structure is cached in the Mat_(Seq/MPI)AIJCUSPARSE spptr (deviceMat) and
   reused on subsequent calls. On return A's offload mask is set to GPU since the
   caller is expected to write matrix values on the device.
*/
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  PetscErrorCode             ierr;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL; /* cusparse CSR arrays are 32-bit ints on device */
  PetscScalar                *aa = NULL,*ba = NULL;
  Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  // get jaca: locate the (diagonal-block) Mat_SeqAIJ and its CUSPARSE struct, and make sure data is on the GPU
  if (size == 1) {
    PetscBool isseqaij;

    ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      if (!jaca->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    } else {
      /* an MPIAIJ matrix on a one-process communicator: use its diagonal block aij->A */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      bi = NULL;  /* no off-diagonal block in the sequential case */
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    if (!A->nooffprocentries && !aij->donotstash) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    if (cusparsestructA->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      if (!matstructB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        /* the CSR on the GPU is compressed; build (once) a full-length row-offset array on the device */
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    /* same full-length row-offset expansion for the diagonal block */
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    cudaError_t                cerr;
    PetscSplitCSRDataStructure h_mat;

    // create and populate struct on host and copy to device
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscInt   *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      if (n && !aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      /* colmap maps global column -> 1-based local off-diag column; calloc zeroes it,
         so a 0 entry presumably flags "column not present locally" -- TODO confirm consumer */
      ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (PetscInt)(ii+1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make a long version of these: cusparse stores 32-bit indices, the split-CSR struct wants PetscInt
        int        *h_bi32, *h_bj32;
        PetscInt   *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr);
        cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
        cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];

        cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

        h_mat->offdiag.i = d_bi64;
        h_mat->offdiag.j = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE;  /* destructor must cudaFree these copies */

        ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr);
      }
#else
      h_mat->offdiag.i = (PetscInt*)bi;  /* 32-bit PetscInt: can alias cusparse's arrays directly */
      h_mat->offdiag.j = (PetscInt*)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      ierr = PetscFree(colmap);CHKERRQ(ierr);
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    {
      if (sizeof(PetscInt) != 8) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"size of PetscInt = %d",(int)sizeof(PetscInt));
      /* widen the diagonal block's 32-bit indices to PetscInt, same as for the off-diagonal block above */
      int        *h_ai32, *h_aj32;
      PetscInt   *h_ai64, *h_aj64, *d_ai64, *d_aj64;
      ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr);
      cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
      cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];

      cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

      h_mat->diag.i = d_ai64;
      h_mat->diag.j = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr);
    }
#else
    h_mat->diag.i = (PetscInt*)ai;
    h_mat->diag.j = (PetscInt*)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;  /* caller will write values on the device */
  PetscFunctionReturn(0);
}
849