#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

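/* Thrust functor: copies the first element of a zip-iterator tuple into the
   second; used below to gather permuted COO values into a work array */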
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

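/* Insert/add COO values. If MatSetPreallocationCOO_MPIAIJCUSPARSE() detected
   interleaved diagonal and off-diagonal entries, the values are first permuted
   through coo_p into the work array coo_pw so that the diagonal part comes
   first; the two contiguous halves are then handed to the SeqAIJCUSPARSE
   kernels for the diagonal (A) and off-diagonal (B) blocks */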
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

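    /* zip (v[coo_p[k]], coo_pw[k]) so that for_each with VecCUDAEquals performs
       coo_pw[k] = v[coo_p[k]] entirely on the device */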
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

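/* Tuple predicate for thrust::partition(): true when the column index (the
   second tuple element) falls inside the local diagonal block [cstart, cend) */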
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

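/* Scalar predicate: true when a column index lies outside [cstart, cend),
   i.e., when the entry belongs to the off-diagonal block */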
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

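/* Maps a global index to a block-local index by subtracting the ownership start */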
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

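/* Preallocate from COO indices: partition the entries on the device into
   diagonal and off-diagonal parts, compact the off-diagonal column indices
   into garray via sort+unique, convert all indices to block-local numbering,
   and build the two sequential SEQAIJCUSPARSE blocks */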
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               N,*jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device; stores coo_i partitioned with diagonal entries first, off-diagonal entries next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
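  /* locate the first off-diagonal entry; if a diagonal entry follows it, the
     input is interleaved and must be partitioned, recording the permutation in coo_p */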
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* convert row indices and diagonal-block column indices from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy off-diagonal column indices to the CPU to build the local-to-global map */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store the compacted column ids of the off-diagonal part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique the off-diagonal column ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* returns the end iterator of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj);CHKERRQ(ierr);
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected IS size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);
171 
172   ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
173   ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
174   ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
175   ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
176   ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
177   ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
178   ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
179   ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
180 
181   /* GPU memory, cusparse specific call handles it internally */
182   ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
183   ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
184   ierr = PetscFree(jj);CHKERRQ(ierr);
185 
186   ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
187   ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
188   ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
189   ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
190   /*
191   ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
192   ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
193   */
194   ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
195   B->preallocated = PETSC_TRUE;
196   B->nonzerostate++;
197 
198   ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
199   ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
200   B->offloadmask = PETSC_OFFLOAD_CPU;
201   B->assembled = PETSC_FALSE;
202   B->was_assembled = PETSC_FALSE;
203   PetscFunctionReturn(0);
204 }
205 
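/* Return the merged on-process matrix [Ad Ao] and, optionally, the global
   column indices: owned columns first, then the ghost columns from cmap */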
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

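/* Standard d_nnz/o_nnz preallocation: (re)create the sequential diagonal and
   off-diagonal blocks as SEQAIJCUSPARSE and propagate the storage format,
   cuSPARSE handle, and CPU binding to them */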
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because b->B may have been resized, we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with the pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

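/* y = A x: overlap the ghost-value scatter with the diagonal-block multiply,
   then add the off-diagonal contribution B * lvec */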
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU. */
  if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);}
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

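/* y = A^T x: both block transposes can start immediately; the reverse scatter
   then accumulates the off-diagonal contributions into y */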
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz = A->nonzerostate;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  PetscCheckFalse(!cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    cusparseStruct->deviceMat           = NULL;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the cuSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters
   accurately, performance during matrix assembly can be increased by more than a
   factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated)
.  n - number of local columns (or PETSC_DECIDE to have it calculated)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL (and likewise for the off-diagonal block) for
   PETSc to control dynamic memory allocation.  For large problems you MUST preallocate
   memory or you will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

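   Example:
   A minimal creation sketch (the global sizes and per-row nonzero counts are illustrative):
.vb
   Mat A;
   ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,&A);CHKERRQ(ierr);
   /* ... MatSetValues(), MatAssemblyBegin/End(), use A ... */
   ierr = MatDestroy(&A);CHKERRQ(ierr);
.ve
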
   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). See the example below.

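   Example:
   Selecting the type and storage format from the command line (the executable name is illustrative):
.vb
   mpiexec -n 2 ./app -mat_type aijcusparse -mat_cusparse_storage_format csr
.ve
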
  Level: beginner

 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.

  Level: beginner

 .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
M*/

// Get device pointers to a stripped-down CSR view of the Mat (a PetscSplitCSRDataStructure),
// for both sequential and MPI matrices; the structure is built lazily and cached in spptr->deviceMat.
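// A hedged usage sketch (assemble_kernel and its launch configuration are illustrative,
// not part of this file):
//   PetscSplitCSRDataStructure d_mat;
//   ierr = MatCUSPARSEGetDeviceMatWrite(A,&d_mat);CHKERRQ(ierr);
//   assemble_kernel<<<nblocks,nthreads>>>(d_mat,...); // set entries on the device, e.g. via MatSetValuesDevice()
//   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
//   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);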
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  PetscErrorCode             ierr;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
  PetscScalar                *aa = NULL,*ba = NULL;
  Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  // get jaca
  if (size == 1) {
    PetscBool isseqaij;

    ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      PetscCheckFalse(!jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      bi = NULL;
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      PetscCheckFalse(!matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheckFalse(!matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    cudaError_t                cerr;
    PetscSplitCSRDataStructure h_mat;

    // create and populate the struct on the host, then copy it to the device
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscInt   *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // must widen the 32-bit cuSPARSE indices into 64-bit PetscInt copies
        int        *h_bi32, *h_bj32;
        PetscInt   *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr);
        cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
        cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];

        cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

        h_mat->offdiag.i = d_bi64;
        h_mat->offdiag.j = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE;

        ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr);
      }
#else
      h_mat->offdiag.i = (PetscInt*)bi;
      h_mat->offdiag.j = (PetscInt*)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      ierr = PetscFree(colmap);CHKERRQ(ierr);
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    {
      PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size of PetscInt = %d",(int)sizeof(PetscInt));
      int        *h_ai32, *h_aj32;
      PetscInt   *h_ai64, *h_aj64, *d_ai64, *d_aj64;
      ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr);
      cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
      cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];

      cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

      h_mat->diag.i = d_ai64;
      h_mat->diag.j = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr);
    }
#else
    h_mat->diag.i = (PetscInt*)ai;
    h_mat->diag.j = (PetscInt*)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to the device
    cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
847