xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision b2f36b75b482ff5d6fb826dd2b9be57b9ad8cf37)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
3 
4 #include <petscconf.h>
5 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
6 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
7 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
8 #include <thrust/advance.h>
9 #include <petscsf.h>
10 
/* Thrust functor for zip iterators: copies the first tuple element into the
   second.  Used below to scatter permuted COO values into a work array. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    /* element 0 is the source value, element 1 the destination slot */
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
20 
/* Set the numerical values of A from the COO pattern established by
   MatSetPreallocationCOO_MPIAIJCUSPARSE.

   v     - one value per (i,j) pair given at preallocation; may be host or
           device memory (detected via isCudaMem()); may be NULL
   imode - INSERT_VALUES or ADD_VALUES, forwarded to the sequential COO setters

   If preallocation had to reorder the COO entries (cusp->coo_p non-NULL), the
   incoming values are permuted into the device work array cusp->coo_pw so that
   diagonal-block entries precede off-diagonal ones before being handed to the
   sequential blocks a->A and a->B. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total number of COO entries */
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) { /* entries were reordered at preallocation time: permute v accordingly */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v); /* values already reside on the device */
    } else {
      /* stage the host values in a temporary device array */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* zip (permuted source value, destination slot in coo_pw) pairs and copy elementwise */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    /* first coo_nd permuted values belong to the diagonal block, the remainder to the off-diagonal block */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else { /* entries already ordered (or v == NULL): pass the values straight through */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* mark the matrix assembled with its values current on the GPU */
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
63 
/* Predicate functor over tuples whose second element is a global column index:
   true when that column lies in the local diagonal-block range [_cstart,_cend). */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return _cstart <= col && col < _cend;
  }
};
76 
/* Predicate functor: true when the global column index c falls outside the
   local diagonal-block range [_cstart,_cend). */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return !(_cstart <= c && c < _cend);
  }
};
88 
/* Unary functor mapping a global index to a local one by subtracting the
   start of the locally owned range. */
struct GlobToLoc
{
  PetscInt _start; /* first global index owned locally */

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};
100 
/* Set up the MPI matrix B from COO (coordinate) index arrays.

   n            - number of (coo_i[k],coo_j[k]) entries on this rank
   coo_i, coo_j - global row/column indices (host memory)

   Entries are partitioned on the device into a diagonal-block part (columns in
   [B->cmap->rstart,B->cmap->rend)) followed by the off-diagonal part.  When a
   reordering is needed, the permutation is recorded in cusp->coo_p (with work
   array cusp->coo_pw) for later use by MatSetValuesCOO.  Off-diagonal column
   indices are compacted into b->garray, and the sequential blocks b->A / b->B
   are (re)created and COO-preallocated on the GPU. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0; /* number of distinct off-diagonal columns */
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* tear down any previous blocks and support data before rebuilding */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  /* upload the COO indices to the device */
  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* locate the first off-diagonal entry, and the first diagonal entry after it */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* diagonal and off-diagonal entries are interleaved: partition them, with
       coo_p recording the original position of each partitioned entry so the
       values can be permuted the same way later */
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # off-diagonal entries */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  /* b->garray holds the sorted distinct global off-diagonal column ids */
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* compact the global off-diagonal ids in jj[] to local indices into garray via a hash map */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* recreate the sequential diagonal (b->A) and off-diagonal (b->B) blocks */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  /* pattern is set but no values yet: not assembled, offload mask reset */
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
202 
/* Merge the diagonal (Ad) and off-diagonal (Ao) blocks of the MPIAIJ matrix A
   into a single sequential matrix *A_loc; when glob is non-NULL also return an
   IS holding the global column index of each merged local column. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) { /* build the local-to-global column map of the merged matrix */
    PetscInt cst,k,nd,no,*gidx;

    ierr = MatGetLocalSize(Ad,NULL,&nd);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&no);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(nd+no,&gidx);CHKERRQ(ierr);
    /* diagonal-block columns are contiguous, starting at the column ownership start */
    for (k=0; k<nd; k++) gidx[k] = cst + k;
    /* off-diagonal columns come from the compacted global map cmap[] */
    for (k=0; k<no; k++) gidx[nd+k] = cmap[k];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),nd+no,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
225 
/* Classical (d_nz/d_nnz, o_nz/o_nnz) preallocation for the MPIAIJCUSPARSE
   matrix B: validate the nonzero counts (debug builds only), discard stale
   support data (colmap, garray, lvec, Mvctx, off-diagonal block), (re)create
   the sequential CUSPARSE blocks b->A and b->B, and forward preallocation and
   cuSPARSE format/handle settings to them. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* reject negative per-row counts (debug builds only) */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  /* drop any stale column map / ghost data from a previous preallocation */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    /* uniprocessor case has no off-diagonal block: give it zero columns */
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* propagate the configured storage formats and share the cuSPARSE handle */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
285 
/* y = A*x for the MPI matrix.  The scatter of x into the ghosted local vector
   a->lvec is started before the diagonal-block multiply and finished after it,
   overlapping communication with computation; the off-diagonal contribution is
   then accumulated into y. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU. */
  if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);}
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);            /* yy  = A_diag * xx (overlapped with the scatter) */
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); /* yy += A_offdiag * lvec */
  PetscFunctionReturn(0);
}
303 
/* Zero all entries of A by zeroing its diagonal and off-diagonal blocks. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *aij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(aij->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
314 
/* z = y + A*x for the MPI matrix, using the same scatter/compute overlap as
   MatMult: start the ghost scatter, z = y + A_diag*x, finish the scatter, then
   z += A_offdiag*lvec. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);      /* zz  = yy + A_diag * xx */
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr); /* zz += A_offdiag * lvec */
  PetscFunctionReturn(0);
}
330 
/* y = A^T*x for the MPI matrix: compute the off-diagonal transpose product
   into the ghost vector and the diagonal transpose product into y, then add
   the ghost contributions back to their owning ranks with a reverse
   scatter-add. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); /* lvec = A_offdiag^T * xx */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);      /* yy   = A_diag^T * xx */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
346 
/* Record the cuSPARSE storage format to use for SpMV on the diagonal and/or
   off-diagonal blocks of the MPI matrix.

   op     - which block(s) the format applies to: MAT_CUSPARSE_MULT_DIAG,
            MAT_CUSPARSE_MULT_OFFDIAG, or MAT_CUSPARSE_ALL
   format - the cuSPARSE storage format

   Raises PETSC_ERR_SUP for any other operation value. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed message: it previously listed MAT_CUSPARSE_MULT_DIAG twice and a
       nonexistent MAT_CUSPARSE_MULT_ALL instead of the three cases actually
       handled by this switch */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
369 
/* Process the -mat_cusparse_*_storage_format options, forwarding any value
   given to MatCUSPARSESetFormat for the diagonal block, the off-diagonal
   block, or both.  Only applies to unfactored matrices. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;      /* set when the corresponding option was given */
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
400 
/* Finish assembly via the generic MPIAIJ path, ensure the ghost vector lvec is
   a CUDA vector, and, if the nonzero pattern changed, free the cached
   device-side split-CSR matrix (it describes the old pattern and is stale). */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz = A->nonzerostate; /* pattern state before assembly */

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    /* the colmap pointer is stored inside the device struct: fetch a host copy
       of the struct first so that pointer can be freed */
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
425 
/* Destructor for MATMPIAIJCUSPARSE: free the cached device split-CSR matrix
   (if any), clear the sub-block handles, destroy the cuSPARSE handle and COO
   scratch arrays, unregister the composed methods, and finish with the
   generic MPIAIJ destructor. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    /* copy the device struct back to the host to recover its colmap pointer */
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* remove the type-specific composed methods installed at conversion time */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
470 
/* Convert an MPIAIJ matrix B to MATMPIAIJCUSPARSE.  For MAT_INITIAL_MATRIX a
   duplicate is converted; for MAT_REUSE_MATRIX the values are copied into the
   existing *newmat; otherwise (MAT_INPLACE_MATRIX) *newmat is B itself.
   Switches sub-blocks and the ghost vector to their CUDA types, allocates the
   Mat_MPIAIJCUSPARSE auxiliary structure with a cuSPARSE handle, installs the
   CUSPARSE-specific operations, and composes the type-specific methods. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  /* vectors created for this matrix default to CUDA vectors */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  /* switch the sub-blocks and the ghost vector to their CUDA flavors */
  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    /* first conversion: allocate and initialize the CUSPARSE auxiliary data */
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    cusparseStruct->deviceMat           = NULL;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
  }

  /* install the CUSPARSE-specific operations */
  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
530 
/* Constructor for MATMPIAIJCUSPARSE: build a plain MPIAIJ matrix and convert
   it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure a CUDA device is initialized before any cuSPARSE usage */
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
541 
542 /*@
543    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz).  By setting these parameters accurately,
548    performance during matrix assembly can be increased by more than a factor of 50.
549 
550    Collective
551 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE)
.  n - number of local columns (or PETSC_DECIDE)
.  M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the
           off-diagonal block (possibly different for each row) or NULL
559 
560    Output Parameter:
561 .  A - the matrix
562 
563    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
564    MatXXXXSetPreallocation() paradigm instead of this routine directly.
565    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
566 
567    Notes:
   If d_nnz (resp. o_nnz) is given then d_nz (resp. o_nz) is ignored
569 
570    The AIJ format (also called the Yale sparse matrix format or
571    compressed row storage), is fully compatible with standard Fortran 77
572    storage.  That is, the stored row and column indices can begin at
573    either one (as in Fortran) or zero.  See the users' manual for details.
574 
575    Specify the preallocated storage with either nz or nnz (not both).
576    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
577    allocation.  For large problems you MUST preallocate memory or you
578    will get TERRIBLE performance, see the users' manual chapter on matrices.
579 
580    By default, this format uses inodes (identical nodes) when possible, to
581    improve numerical efficiency of matrix-vector products and solves. We
582    search for consecutive rows with the same nonzero structure, thereby
583    reusing matrix information to achieve increased efficiency.
584 
585    Level: intermediate
586 
587 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
588 @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) { /* parallel: MPI flavor with diagonal/off-diagonal preallocation */
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else { /* single process: sequential flavor, only the diagonal arguments apply */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
607 
608 /*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.
610 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
612    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
613    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
614 
615    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
616    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
617    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
618    for communicators controlling multiple processes.  It is recommended that you call both of
619    the above preallocation routines for simplicity.
620 
621    Options Database Keys:
622 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
623 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
624 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
625 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
626 
627   Level: beginner
628 
629  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
630 M*/
631 
632 /*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
634 
635   Level: beginner
636 
637  .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
638 M*/
639 
640 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
641 
// get GPU pointers to stripped down Mat. For both seq and MPI Mat.
//
// Builds (or reuses) a device-resident PetscSplitCSRDataStructure holding raw GPU
// pointers to the CSR arrays of A's diagonal block (and, in parallel, its off-diagonal
// block), so device code can write matrix values directly.
//
//   A - assembled AIJCUSPARSE matrix; must not be factored (factored => *B = NULL)
//   B - on return, device pointer to the populated PetscSplitCSRDataStructure
//
// Side effect: sets A->offloadmask to PETSC_OFFLOAD_GPU, since the caller is handed
// write access to the device copy of the values.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  PetscErrorCode             ierr;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL; /* CSR row offsets (ai,bi) and column indices (aj,bj) for diag/offdiag blocks */
  PetscScalar                *aa = NULL,*ba = NULL;                       /* CSR values for diag/offdiag blocks */
  Mat_SeqAIJ                 *jaca = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  if (size == 1) {
    PetscBool isseqaij;

    /* uniprocessor: A may be a true SEQAIJ or an MPIAIJ living on one rank */
    ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      if (!jaca->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      /* snapshot any previously created device struct before refreshing GPU data */
      d_mat = cusparsestructA->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      /* only the diagonal block aij->A is meaningful on a single rank */
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      /* no off-diagonal block in the sequential case */
      bi = NULL;
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    /* parallel: need GPU pointers for both the diagonal (A) and off-diagonal (B) blocks */
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    if (!A->nooffprocentries && !aij->donotstash) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    if (cusparsestructA->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      if (!matstructB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        /* compressed-row storage omits empty rows; stage full (rmap->n+1)-entry row
           offsets on the GPU once and cache them in rowoffsets_gpu for reuse */
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    /* same full row-offset staging for the diagonal block (see off-diagonal case above) */
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    cudaError_t                cerr;
    PetscSplitCSRDataStructure h_mat;

    // create and populate struct on host and copy to device
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      int        *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      if (n && !aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      /* colmap maps global column -> local offdiag column + 1; zero (from calloc) marks
         columns not present in the off-diagonal block */
      ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);

      h_mat->offdiag.i = bi;
      h_mat->offdiag.j = bj;
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(int));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(int),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      ierr = PetscFree(colmap);CHKERRQ(ierr);
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->N      = A->cmap->N;
    h_mat->diag.i = ai;
    h_mat->diag.j = aj;
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  } else {
    /* deviceMat was already built on a previous call; hand back the cached pointer */
    ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
781