xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 2f613bf53f46f9356e00a2ca2bd69453be72fc31)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
3 
4 #include <petscconf.h>
5 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
6 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
7 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
8 #include <thrust/advance.h>
9 #include <petscsf.h>
10 
11 struct VecCUDAEquals
12 {
13   template <typename Tuple>
14   __host__ __device__
15   void operator()(Tuple t)
16   {
17     thrust::get<1>(t) = thrust::get<0>(t);
18   }
19 };
20 
21 static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
22 {
23   Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
24   Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
25   PetscInt           n = cusp->coo_nd + cusp->coo_no;
26   PetscErrorCode     ierr;
27 
28   PetscFunctionBegin;
29   if (cusp->coo_p && v) {
30     thrust::device_ptr<const PetscScalar> d_v;
31     THRUSTARRAY                           *w = NULL;
32 
33     if (isCudaMem(v)) {
34       d_v = thrust::device_pointer_cast(v);
35     } else {
36       w = new THRUSTARRAY(n);
37       w->assign(v,v+n);
38       ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
39       d_v = w->data();
40     }
41 
42     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
43                                                               cusp->coo_pw->begin()));
44     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
45                                                               cusp->coo_pw->end()));
46     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
47     thrust::for_each(zibit,zieit,VecCUDAEquals());
48     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
49     delete w;
50     ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
51     ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
52   } else {
53     ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
54     ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
55   }
56   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
57   A->num_ass++;
58   A->assembled        = PETSC_TRUE;
59   A->ass_nonzerostate = A->nonzerostate;
60   A->offloadmask      = PETSC_OFFLOAD_GPU;
61   PetscFunctionReturn(0);
62 }
63 
64 template <typename Tuple>
65 struct IsNotOffDiagT
66 {
67   PetscInt _cstart,_cend;
68 
69   IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
70   __host__ __device__
71   inline bool operator()(Tuple t)
72   {
73     return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
74   }
75 };
76 
77 struct IsOffDiag
78 {
79   PetscInt _cstart,_cend;
80 
81   IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
82   __host__ __device__
83   inline bool operator() (const PetscInt &c)
84   {
85     return c < _cstart || c >= _cend;
86   }
87 };
88 
89 struct GlobToLoc
90 {
91   PetscInt _start;
92 
93   GlobToLoc(PetscInt start) : _start(start) {}
94   __host__ __device__
95   inline PetscInt operator() (const PetscInt &c)
96   {
97     return c - _start;
98   }
99 };
100 
101 static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
102 {
103   Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
104   Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
105   PetscErrorCode         ierr;
106   PetscInt               *jj;
107   size_t                 noff = 0;
108   THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
109   THRUSTINTARRAY         d_j(n);
110   ISLocalToGlobalMapping l2g;
111   cudaError_t            cerr;
112 
113   PetscFunctionBegin;
114   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
115   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
116   if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
117   if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
118   ierr = PetscFree(b->garray);CHKERRQ(ierr);
119   ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
120   ierr = MatDestroy(&b->A);CHKERRQ(ierr);
121   ierr = MatDestroy(&b->B);CHKERRQ(ierr);
122 
123   ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
124   d_i.assign(coo_i,coo_i+n);
125   d_j.assign(coo_j,coo_j+n);
126   delete cusp->coo_p;
127   delete cusp->coo_pw;
128   cusp->coo_p = NULL;
129   cusp->coo_pw = NULL;
130   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
131   auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
132   auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
133   if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
134     cusp->coo_p = new THRUSTINTARRAY(n);
135     cusp->coo_pw = new THRUSTARRAY(n);
136     thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
137     auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
138     auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
139     auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
140     firstoffd = mzipp.get_iterator_tuple().get<1>();
141   }
142   cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
143   cusp->coo_no = thrust::distance(firstoffd,d_j.end());
144 
145   /* from global to local */
146   thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
147   thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
148   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
149 
150   /* copy offdiag column indices to map on the CPU */
151   ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
152   cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
153   auto o_j = d_j.begin();
154   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
155   thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
156   thrust::sort(thrust::device,o_j,d_j.end());
157   auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
158   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
159   noff = thrust::distance(o_j,wit);
160   ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
161   cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
162   ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
163   ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
164   ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
165   ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
166   if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
167   ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);
168 
169   ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
170   ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
171   ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
172   ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
173   ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
174   ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
175   ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
176   ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
177 
178   /* GPU memory, cusparse specific call handles it internally */
179   ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
180   ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
181   ierr = PetscFree(jj);CHKERRQ(ierr);
182 
183   ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
184   ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
185   ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
186   ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
187   /*
188   ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
189   ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
190   */
191   ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
192   B->preallocated = PETSC_TRUE;
193   B->nonzerostate++;
194 
195   ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
196   ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
197   B->offloadmask = PETSC_OFFLOAD_CPU;
198   B->assembled = PETSC_FALSE;
199   B->was_assembled = PETSC_FALSE;
200   PetscFunctionReturn(0);
201 }
202 
203 static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
204 {
205   Mat            Ad,Ao;
206   const PetscInt *cmap;
207   PetscErrorCode ierr;
208 
209   PetscFunctionBegin;
210   ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
211   ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
212   if (glob) {
213     PetscInt cst, i, dn, on, *gidx;
214 
215     ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
216     ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
217     ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
218     ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
219     for (i=0; i<dn; i++) gidx[i]    = cst + i;
220     for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
221     ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
222   }
223   PetscFunctionReturn(0);
224 }
225 
226 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
227 {
228   Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
229   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
230   PetscErrorCode     ierr;
231   PetscInt           i;
232 
233   PetscFunctionBegin;
234   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
235   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
236   if (PetscDefined(USE_DEBUG) && d_nnz) {
237     for (i=0; i<B->rmap->n; i++) {
238       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
239     }
240   }
241   if (PetscDefined(USE_DEBUG) && o_nnz) {
242     for (i=0; i<B->rmap->n; i++) {
243       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
244     }
245   }
246 #if defined(PETSC_USE_CTABLE)
247   ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
248 #else
249   ierr = PetscFree(b->colmap);CHKERRQ(ierr);
250 #endif
251   ierr = PetscFree(b->garray);CHKERRQ(ierr);
252   ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
253   ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
254   /* Because the B will have been resized we simply destroy it and create a new one each time */
255   ierr = MatDestroy(&b->B);CHKERRQ(ierr);
256   if (!b->A) {
257     ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
258     ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
259     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
260   }
261   if (!b->B) {
262     PetscMPIInt size;
263     ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
264     ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
265     ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
266     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
267   }
268   ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
269   ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
270   ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
271   ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
272   ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
273   ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
274   ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
275   ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
276   ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
277   ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
278   /* Let A, B use b's handle with pre-set stream
279   ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
280   ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
281   */
282   B->preallocated = PETSC_TRUE;
283   PetscFunctionReturn(0);
284 }
285 
286 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
287 {
288   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
289   PetscErrorCode ierr;
290   PetscInt       nt;
291 
292   PetscFunctionBegin;
293   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
294   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
295   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
296   ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
297   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
298   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
299   PetscFunctionReturn(0);
300 }
301 
302 PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
303 {
304   Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
305   PetscErrorCode ierr;
306 
307   PetscFunctionBegin;
308   ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
309   ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
310   PetscFunctionReturn(0);
311 }
312 
313 PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
314 {
315   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
316   PetscErrorCode ierr;
317   PetscInt       nt;
318 
319   PetscFunctionBegin;
320   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
321   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
322   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
323   ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
324   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
325   ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
326   PetscFunctionReturn(0);
327 }
328 
329 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
330 {
331   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
332   PetscErrorCode ierr;
333   PetscInt       nt;
334 
335   PetscFunctionBegin;
336   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
337   if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
338   ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
339   ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
340   ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
341   ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
342   PetscFunctionReturn(0);
343 }
344 
345 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
346 {
347   Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
348   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
349 
350   PetscFunctionBegin;
351   switch (op) {
352   case MAT_CUSPARSE_MULT_DIAG:
353     cusparseStruct->diagGPUMatFormat = format;
354     break;
355   case MAT_CUSPARSE_MULT_OFFDIAG:
356     cusparseStruct->offdiagGPUMatFormat = format;
357     break;
358   case MAT_CUSPARSE_ALL:
359     cusparseStruct->diagGPUMatFormat    = format;
360     cusparseStruct->offdiagGPUMatFormat = format;
361     break;
362   default:
363     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op);
364   }
365   PetscFunctionReturn(0);
366 }
367 
368 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
369 {
370   MatCUSPARSEStorageFormat format;
371   PetscErrorCode           ierr;
372   PetscBool                flg;
373   Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
374   Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
375 
376   PetscFunctionBegin;
377   ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
378   if (A->factortype==MAT_FACTOR_NONE) {
379     ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
380                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
381     if (flg) {
382       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
383     }
384     ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
385                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
386     if (flg) {
387       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
388     }
389     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
390                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
391     if (flg) {
392       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
393     }
394   }
395   ierr = PetscOptionsTail();CHKERRQ(ierr);
396   PetscFunctionReturn(0);
397 }
398 
399 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
400 {
401   PetscErrorCode     ierr;
402   Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
403   Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
404   PetscObjectState   onnz = A->nonzerostate;
405 
406   PetscFunctionBegin;
407   ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
408   if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
409   if (onnz != A->nonzerostate && cusp->deviceMat) {
410     PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
411     cudaError_t                cerr;
412 
413     ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
414     ierr = PetscNew(&h_mat);CHKERRQ(ierr);
415     cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
416     cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
417     cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
418     ierr = PetscFree(h_mat);CHKERRQ(ierr);
419     cusp->deviceMat = NULL;
420   }
421   PetscFunctionReturn(0);
422 }
423 
424 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
425 {
426   PetscErrorCode     ierr;
427   Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
428   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
429   cusparseStatus_t   stat;
430 
431   PetscFunctionBegin;
432   if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
433   if (cusparseStruct->deviceMat) {
434     PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
435     cudaError_t                cerr;
436 
437     ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
438     ierr = PetscNew(&h_mat);CHKERRQ(ierr);
439     cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
440     cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
441     cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
442     ierr = PetscFree(h_mat);CHKERRQ(ierr);
443   }
444   try {
445     if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
446     if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
447     stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
448     /* We want cusparseStruct to use PetscDefaultCudaStream
449     if (cusparseStruct->stream) {
450       cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
451     }
452     */
453     delete cusparseStruct->coo_p;
454     delete cusparseStruct->coo_pw;
455     delete cusparseStruct;
456   } catch(char *ex) {
457     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
458   }
459   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
460   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
461   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
462   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
463   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
464   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
465   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
466   PetscFunctionReturn(0);
467 }
468 
469 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
470 {
471   PetscErrorCode     ierr;
472   Mat_MPIAIJ         *a;
473   Mat_MPIAIJCUSPARSE *cusparseStruct;
474   cusparseStatus_t   stat;
475   Mat                A;
476 
477   PetscFunctionBegin;
478   if (reuse == MAT_INITIAL_MATRIX) {
479     ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
480   } else if (reuse == MAT_REUSE_MATRIX) {
481     ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
482   }
483   A = *newmat;
484   A->boundtocpu = PETSC_FALSE;
485   ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
486   ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);
487 
488   a = (Mat_MPIAIJ*)A->data;
489   if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
490   if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
491   if (a->lvec) {
492     ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
493   }
494 
495   if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
496     a->spptr = new Mat_MPIAIJCUSPARSE;
497 
498     cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
499     cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
500     cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
501     cusparseStruct->coo_p               = NULL;
502     cusparseStruct->coo_pw              = NULL;
503     cusparseStruct->stream              = 0;
504     cusparseStruct->deviceMat           = NULL;
505     stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
506   }
507 
508   A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
509   A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
510   A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
511   A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
512   A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
513   A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
514   A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
515   A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
516 
517   ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
518   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
519   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
520   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
521   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
522   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
523 #if defined(PETSC_HAVE_HYPRE)
524   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
525 #endif
526   PetscFunctionReturn(0);
527 }
528 
529 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
530 {
531   PetscErrorCode ierr;
532 
533   PetscFunctionBegin;
534   ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
535   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
536   ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
537   PetscFunctionReturn(0);
538 }
539 
540 /*@
541    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
542    (the default parallel PETSc format).  This matrix will ultimately pushed down
543    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
544    assembly performance the user should preallocate the matrix storage by setting
545    the parameter nz (or the array nnz).  By setting these parameters accurately,
546    performance during matrix assembly can be increased by more than a factor of 50.
547 
548    Collective
549 
550    Input Parameters:
551 +  comm - MPI communicator, set to PETSC_COMM_SELF
552 .  m - number of rows
553 .  n - number of columns
554 .  nz - number of nonzeros per row (same for all rows)
555 -  nnz - array containing the number of nonzeros in the various rows
556          (possibly different for each row) or NULL
557 
558    Output Parameter:
559 .  A - the matrix
560 
561    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
562    MatXXXXSetPreallocation() paradigm instead of this routine directly.
563    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
564 
565    Notes:
566    If nnz is given then nz is ignored
567 
568    The AIJ format (also called the Yale sparse matrix format or
569    compressed row storage), is fully compatible with standard Fortran 77
570    storage.  That is, the stored row and column indices can begin at
571    either one (as in Fortran) or zero.  See the users' manual for details.
572 
573    Specify the preallocated storage with either nz or nnz (not both).
574    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
575    allocation.  For large problems you MUST preallocate memory or you
576    will get TERRIBLE performance, see the users' manual chapter on matrices.
577 
578    By default, this format uses inodes (identical nodes) when possible, to
579    improve numerical efficiency of matrix-vector products and solves. We
580    search for consecutive rows with the same nonzero structure, thereby
581    reusing matrix information to achieve increased efficiency.
582 
583    Level: intermediate
584 
585 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
586 @*/
587 PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
588 {
589   PetscErrorCode ierr;
590   PetscMPIInt    size;
591 
592   PetscFunctionBegin;
593   ierr = MatCreate(comm,A);CHKERRQ(ierr);
594   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
595   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
596   if (size > 1) {
597     ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
598     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
599   } else {
600     ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
601     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
602   }
603   PetscFunctionReturn(0);
604 }
605 
606 /*MC
607    MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATMPIAIJCUSPARSE.
608 
609    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
610    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
611    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
612 
613    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
614    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
615    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
616    for communicators controlling multiple processes.  It is recommended that you call both of
617    the above preallocation routines for simplicity.
618 
619    Options Database Keys:
620 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
621 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
622 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
623 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
624 
625   Level: beginner
626 
627  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
628 M*/
629 
630 /*MC
631    MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATAIJCUSPARSE.
632 
633   Level: beginner
634 
635  .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
636 M*/
637 
638 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
639 
640 // get GPU pointers to stripped down Mat. For both seq and MPI Mat.
641 PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
642 {
643   PetscSplitCSRDataStructure d_mat;
644   PetscMPIInt                size;
645   PetscErrorCode             ierr;
646   int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
647   PetscScalar                *aa = NULL,*ba = NULL;
648   Mat_SeqAIJ                 *jaca = NULL;
649   Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
650   CsrMatrix                  *matrixA = NULL,*matrixB = NULL;
651 
652   PetscFunctionBegin;
653   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
654   if (A->factortype != MAT_FACTOR_NONE) {
655     *B = NULL;
656     PetscFunctionReturn(0);
657   }
658   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
659   if (size == 1) {
660     PetscBool isseqaij;
661 
662     ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
663     if (isseqaij) {
664       jaca = (Mat_SeqAIJ*)A->data;
665       if (!jaca->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
666       cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
667       d_mat = cusparsestructA->deviceMat;
668       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
669     } else {
670       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
671       if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
672       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
673       jaca = (Mat_SeqAIJ*)aij->A->data;
674       cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
675       d_mat = spptr->deviceMat;
676       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
677     }
678     if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
679       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
680       if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
681       matrixA = (CsrMatrix*)matstruct->mat;
682       bi = NULL;
683       bj = NULL;
684       ba = NULL;
685     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
686   } else {
687     Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
688     if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
689     jaca = (Mat_SeqAIJ*)aij->A->data;
690     Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data;
691     Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
692 
693     if (!A->nooffprocentries && !aij->donotstash) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
694     cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
695     Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
696     if (cusparsestructA->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
697     if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
698       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
699       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
700       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
701       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
702       if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
703       if (!matstructB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
704       matrixA = (CsrMatrix*)matstructA->mat;
705       matrixB = (CsrMatrix*)matstructB->mat;
706       if (jacb->compressedrow.use) {
707         if (!cusparsestructB->rowoffsets_gpu) {
708           cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
709           cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
710         }
711         bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
712       } else {
713         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
714       }
715       bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
716       ba = thrust::raw_pointer_cast(matrixB->values->data());
717     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
718     d_mat = spptr->deviceMat;
719   }
720   if (jaca->compressedrow.use) {
721     if (!cusparsestructA->rowoffsets_gpu) {
722       cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
723       cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
724     }
725     ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
726   } else {
727     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
728   }
729   aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
730   aa = thrust::raw_pointer_cast(matrixA->values->data());
731 
732   if (!d_mat) {
733     cudaError_t                cerr;
734     PetscSplitCSRDataStructure h_mat;
735 
736     // create and populate strucy on host and copy on device
737     ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
738     ierr = PetscNew(&h_mat);CHKERRQ(ierr);
739     cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
740     if (size > 1) { /* need the colmap array */
741       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
742       int        *colmap;
743       PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;
744 
745       if (n && !aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
746 
747       ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
748       for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
749 
750       h_mat->offdiag.i = bi;
751       h_mat->offdiag.j = bj;
752       h_mat->offdiag.a = ba;
753       h_mat->offdiag.n = A->rmap->n;
754 
755       cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(int));CHKERRCUDA(cerr);
756       cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(int),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
757       ierr = PetscFree(colmap);CHKERRQ(ierr);
758     }
759     h_mat->rstart = A->rmap->rstart;
760     h_mat->rend   = A->rmap->rend;
761     h_mat->cstart = A->cmap->rstart;
762     h_mat->cend   = A->cmap->rend;
763     h_mat->N      = A->cmap->N;
764     h_mat->diag.i = ai;
765     h_mat->diag.j = aj;
766     h_mat->diag.a = aa;
767     h_mat->diag.n = A->rmap->n;
768     h_mat->rank   = PetscGlobalRank;
769     // copy pointers and metadata to device
770     cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
771     ierr = PetscFree(h_mat);CHKERRQ(ierr);
772   } else {
773     ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
774   }
775   *B = d_mat;
776   A->offloadmask = PETSC_OFFLOAD_GPU;
777   PetscFunctionReturn(0);
778 }
779