xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 030f984af8d8bb4c203755d35bded3c05b3d83ce)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
3 
4 #include <petscconf.h>
5 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
6 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
7 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
8 #include <thrust/advance.h>
9 #include <petscsf.h>
10 
/* thrust functor: assign the first tuple component to the second.  Applied via
   thrust::for_each over a zip iterator to copy values between two device
   sequences (here: gathering permuted COO values into a work array). */
struct VecCUDAEquals
{
  template <typename TupleT>
  __host__ __device__
  void operator()(TupleT zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped);
  }
};
20 
/* Insert/add numerical values into a matrix previously preallocated with
   MatSetPreallocationCOO_MPIAIJCUSPARSE().  v holds one scalar per COO entry in
   the caller's original (i,j) order; it may point to host or device memory, or
   be NULL (passed through to the sequential kernels).  When the preallocation
   step recorded a permutation (cusp->coo_p), the values are reordered on the
   device so that diagonal-block entries come first. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total COO entries: diagonal block + off-diagonal block */
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v); /* values already on device: use in place */
    } else {
      /* host values: stage them in a temporary device array */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* zip (permuted source, destination) so one for_each performs the gather into coo_pw */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    /* permuted values: first coo_nd entries feed the diagonal block, the rest the off-diagonal block */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    /* input was already split (or v is NULL): forward the raw pointer with an offset */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  /* values changed and now live on the GPU */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
63 
/* thrust predicate on (row,col,perm) tuples: true when the column index in
   slot 1 lies inside the locally owned column range [_cstart,_cend), i.e. the
   entry belongs to the diagonal block. */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return (col >= _cstart) && (col < _cend);
  }
};
76 
/* thrust predicate: true when column index c falls outside the locally owned
   column range [_cstart,_cend), i.e. the entry belongs to the off-diagonal block. */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt lo, PetscInt hi) : _cstart(lo), _cend(hi) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return !(_cstart <= c && c < _cend);
  }
};
88 
/* thrust functor: shift a global index down by a fixed offset (the owning
   rank's range start) to obtain the corresponding local index. */
struct GlobToLoc
{
  PetscInt _shift;

  GlobToLoc(PetscInt start) : _shift(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _shift;
  }
};
100 
/* Analyze a COO pattern (coo_i[],coo_j[] in global indices, length n), split it
   into diagonal-block and off-diagonal-block parts on the device, and (re)create
   the sequential sub-matrices b->A and b->B with matching preallocation.  If the
   input interleaves the two parts, the permutation that separates them is kept
   in cusp->coo_p for later use by MatSetValuesCOO_MPIAIJCUSPARSE(). */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* discard any previous diagonal/off-diagonal blocks and associated ghost data */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* locate the first off-diagonal entry; if a diagonal entry follows it the input
     is interleaved and we must partition (and record the permutation) */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    /* partition (i,j,perm) triples so diagonal-block entries come first */
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* entries destined for the diagonal block */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* entries destined for the off-diagonal block */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit); /* number of distinct off-process columns */
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* compact jj[] from global numbering to local (0..noff-1) via a hash-based map;
     note n is reused here as an output count */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* (re)create the sequential blocks: A is rmap->n x cmap->n, B is rmap->n x noff */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  /* propagate the storage format and share this matrix's cusparse handle */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  /* no values set yet: matrix is unassembled with data considered on the CPU */
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
202 
/* Merge the diagonal and off-diagonal blocks of A into one sequential matrix
   (*A_loc); optionally return in *glob the global column indices of the merged
   matrix (owned columns first, then the ghost columns from garray). */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Adiag,Aoff;
  const PetscInt *garray;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Adiag,&Aoff,&garray);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Adiag,Aoff,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt nd,no,cstart,k,*idx;

    ierr = MatGetLocalSize(Adiag,NULL,&nd);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Aoff,NULL,&no);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cstart,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(nd+no,&idx);CHKERRQ(ierr);
    /* diagonal-block columns are the contiguous locally owned range;
       off-diagonal columns come from the ghost map */
    for (k=0; k<nd; k++) idx[k]    = cstart + k;
    for (k=0; k<no; k++) idx[nd+k] = garray[k];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Adiag),nd+no,idx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
225 
/* Standard (d_nz/d_nnz, o_nz/o_nnz) preallocation for MPIAIJCUSPARSE: validates
   the per-row nonzero counts in debug builds, rebuilds the sequential diagonal
   (b->A) and off-diagonal (b->B) blocks as SEQAIJCUSPARSE, and propagates the
   storage format and cusparse handle to both blocks. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* debug-only sanity check: per-row counts must be nonnegative */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  /* drop the cached global-to-local column map and ghost communication data */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    /* on a single rank there is no off-diagonal part, so b->B has zero columns */
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* both blocks share this matrix's storage-format choices and cusparse handle */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
285 
/* yy = A*xx.  Starts the ghost-value scatter, applies the diagonal block while
   communication is in flight, then adds the off-diagonal contribution. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscInt       xlen;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  /* overlap ghost-value communication with the diagonal-block product */
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
301 
/* Zero all entries of A by zeroing its two sequential blocks. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* diagonal block first, then the off-diagonal block */
  ierr = MatZeroEntries(mpiaij->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(mpiaij->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
312 
/* zz = yy + A*xx.  Same communication/computation overlap as MatMult: the
   diagonal block runs while ghost values are being scattered. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscInt       xlen;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  /* overlap ghost-value communication with the diagonal-block product */
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
328 
/* yy = A^T*xx.  Both block transposes are applied locally first; the
   off-diagonal contributions (in lvec) are then accumulated onto their owning
   ranks with a reverse scatter using ADD_VALUES. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscInt       xlen;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,xlen);
  ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
344 
/* Select the cuSPARSE storage format used for SpMV on the diagonal and/or
   off-diagonal blocks of an MPIAIJCUSPARSE matrix.  The format is recorded in
   the Mat_MPIAIJCUSPARSE context and applied to the sequential blocks when
   they are (re)preallocated. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed message: it previously listed MAT_CUSPARSE_MULT_DIAG twice and the
       nonexistent MAT_CUSPARSE_MULT_ALL instead of the cases handled above */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
367 
/* Process runtime options for MPIAIJCUSPARSE matrices: storage formats for the
   diagonal block, the off-diagonal block, or both.  Skipped for factored
   matrices. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  Mat_MPIAIJ               *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;
  PetscErrorCode           ierr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,fmt);CHKERRQ(ierr);
    }
    /* off-diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->offdiagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,fmt);CHKERRQ(ierr);
    }
    /* both blocks at once */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
398 
/* Finish assembly via the generic MPIAIJ path, switch the ghost vector to the
   CUDA type, and invalidate the cached device-side split-CSR structure when the
   nonzero pattern changed (the struct and its colmap live in device memory and
   would otherwise be stale). */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz = A->nonzerostate; /* pattern state before assembly, to detect changes */

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    /* copy the struct back to the host so its embedded device pointer (colmap) can be freed */
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL; /* will be rebuilt on next MatCUSPARSEGetDeviceMatWrite() */
  }
  PetscFunctionReturn(0);
}
423 
/* Destructor for MPIAIJCUSPARSE: frees the cached device-side split-CSR
   structure (if any), releases the cusparse handle and the COO permutation/work
   arrays, removes the composed methods, and delegates to the MPIAIJ destructor. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the struct back to the host so its embedded device pointer (colmap) can be freed */
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try {
    /* detach the shared handle from the blocks before destroying it here */
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* remove the type-specific composed methods installed by MatConvert */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
467 
/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE (initial, reuse, or in-place per
   MatReuse semantics): switch the sequential blocks and ghost vector to their
   CUDA types, allocate the Mat_MPIAIJCUSPARSE context with a cusparse handle,
   and install the GPU operation table and composed COO/preallocation hooks. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat; /* for MAT_INPLACE_MATRIX this aliases B itself */
  A->boundtocpu = PETSC_FALSE;
  /* vectors created from this matrix should default to the CUDA type */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  /* build the CUSPARSE-specific context (skipped on reuse, where it already exists) */
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    cusparseStruct->deviceMat           = NULL;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
  }

  /* install the GPU implementations of the core operations */
  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
524 
/* MATMPIAIJCUSPARSE constructor: build a plain MPIAIJ matrix, then convert it
   in place to the CUSPARSE flavor (which installs the GPU operation table). */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* make sure the CUDA device is initialized */
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
535 
536 /*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz, o_nz (or the arrays d_nnz, o_nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.
543 
544    Collective
545 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE)
.  n - number of local columns (or PETSC_DECIDE)
.  M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal block
           (possibly different for each row) or NULL
553 
554    Output Parameter:
555 .  A - the matrix
556 
557    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
558    MatXXXXSetPreallocation() paradigm instead of this routine directly.
559    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
560 
561    Notes:
562    If nnz is given then nz is ignored
563 
564    The AIJ format (also called the Yale sparse matrix format or
565    compressed row storage), is fully compatible with standard Fortran 77
566    storage.  That is, the stored row and column indices can begin at
567    either one (as in Fortran) or zero.  See the users' manual for details.
568 
569    Specify the preallocated storage with either nz or nnz (not both).
570    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
571    allocation.  For large problems you MUST preallocate memory or you
572    will get TERRIBLE performance, see the users' manual chapter on matrices.
573 
574    By default, this format uses inodes (identical nodes) when possible, to
575    improve numerical efficiency of matrix-vector products and solves. We
576    search for consecutive rows with the same nonzero structure, thereby
577    reusing matrix information to achieve increased efficiency.
578 
579    Level: intermediate
580 
581 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
582 @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt    nproc;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&nproc);CHKERRMPI(ierr);
  if (nproc == 1) {
    /* uniprocessor communicator: a sequential CUSPARSE matrix suffices */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
601 
602 /*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.
604 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
606    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
607    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
608 
609    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
610    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
611    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
612    for communicators controlling multiple processes.  It is recommended that you call both of
613    the above preallocation routines for simplicity.
614 
615    Options Database Keys:
616 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
617 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
618 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
619 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
620 
621   Level: beginner
622 
623  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
624 M*/
625 
626 /*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
628 
629   Level: beginner
630 
631  .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
632 M*/
633 
634 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
635 
636 // get GPU pointers to stripped down Mat. For both seq and MPI Mat.
/*
   MatCUSPARSEGetDeviceMatWrite - returns a device-resident PetscSplitCSRDataStructure
   exposing raw GPU CSR pointers (row offsets, column indices, values) for the diagonal
   block of A and, in parallel, also for the off-diagonal block, so that device kernels
   can insert/read values directly.

   Input Parameter:
.  A - the matrix; must be assembled, must not be factored, and its CUSPARSE storage
       format must be MAT_CUSPARSE_CSR (ELL/HYB are rejected)

   Output Parameter:
.  B - pointer to the descriptor in GPU memory, or NULL when A is factored

   Notes:
   On exit A->offloadmask is set to PETSC_OFFLOAD_GPU since the caller is expected
   to modify the matrix on the device.
*/
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;                                   /* cached/created descriptor living in device memory */
  PetscMPIInt                size;
  PetscErrorCode             ierr;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL; /* device row-offset (i) and column-index (j) pointers; declared int to match the 32-bit cusparse CSR arrays */
  PetscScalar                *aa = NULL,*ba = NULL;                   /* device value arrays: a* = diagonal block, b* = off-diagonal block */
  Mat_SeqAIJ                 *jaca = NULL;                            /* host-side SeqAIJ data of the diagonal block */
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;         /* GPU CSR matrices backing the diagonal/off-diagonal blocks */

  PetscFunctionBegin;
  if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices carry no plain CSR data usable for device assembly */
    *B = NULL;
    PetscFunctionReturn(0);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  if (size == 1) {
    /* Uniprocess: A may be a true SEQAIJ(CUSPARSE) matrix, or an MPIAIJ(CUSPARSE)
       matrix on a communicator of size one; either way only the diagonal block exists. */
    PetscBool isseqaij;

    ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      if (!jaca->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      /* pick up any previously created device descriptor before refreshing the GPU copy */
      d_mat = cusparsestructA->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    } else {
      /* MPIAIJ on one rank: drill down to its (local) diagonal block aij->A */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      /* no off-diagonal block in the sequential case */
      bi = NULL;
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    /* Parallel: expose both the diagonal block aij->A and the off-diagonal block aij->B */
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    /* device assembly has no stash mechanism, so off-process insertion must be disabled by the user */
    if (!A->nooffprocentries && !aij->donotstash) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    if (cusparsestructA->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      if (!matstructB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        /* the GPU CSR of B is row-compressed; upload the full host row-offset array
           (assumes jacb->i holds n+1 uncompressed offsets -- TODO confirm for compressedrow matrices) */
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  /* same full-row-offsets treatment for the diagonal block */
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    cudaError_t                cerr;
    PetscSplitCSRDataStructure h_mat;                                 /* host staging copy of the descriptor */

    /* create and populate the struct on the host, then copy it to the device.
       NOTE(review): d_mat is not stored back into spptr->deviceMat /
       cusparsestructA->deviceMat in this function, so the "Reusing device matrix"
       path would never trigger from here alone -- confirm a caller caches it,
       otherwise each call leaks one cudaMalloc'ed descriptor. */
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);                            /* zeroed, so colmap/offdiag stay NULL/0 in the uniprocess case */
    cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
    if (size > 1) { /* need the colmap array to translate global columns to off-diagonal local columns */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      int        *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      if (n && !aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      /* colmap[g] = local column + 1 for global columns present in B; 0 marks "not present" */
      ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);

      h_mat->offdiag.i = bi;
      h_mat->offdiag.j = bj;
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      /* the dense size-N colmap is replicated on the device (O(N) memory per rank) */
      cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(int));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(int),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      ierr = PetscFree(colmap);CHKERRQ(ierr);
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;                                  /* PetscLayout names its range fields rstart/rend even for the column map */
    h_mat->cend   = A->cmap->rend;
    h_mat->N      = A->cmap->N;
    h_mat->diag.i = ai;
    h_mat->diag.j = aj;
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
  }
  *B = d_mat;
  /* caller will mutate values on the device, so the GPU copy becomes authoritative */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
775