xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 5fedec97950c19de564efaecd0f125b1a6cb2b20)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
3 
4 #include <petscconf.h>
5 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
6 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
7 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
8 #include <thrust/advance.h>
9 #include <petscsf.h>
10 
/* Functor for thrust::for_each over a zip iterator: copies the first tuple
   element into the second.  Used below to gather permuted matrix values into
   a contiguous work array.  Callable on both host and device. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
20 
/* MatSetValuesCOO implementation for MATMPIAIJCUSPARSE: insert/add the value
   array v (one entry per coordinate given to MatSetPreallocationCOO) into the
   diagonal (a->A) and off-diagonal (a->B) blocks on the GPU.
   v may point to host or device memory (detected via isCudaMem); a NULL v is
   forwarded as NULL to the sequential implementations. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total #coo entries: diagonal + off-diagonal */
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    /* coo_p is the permutation computed at preallocation time (diagonal-block
       entries first, off-diagonal after); gather v through it into the work
       array coo_pw */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* v lives on the host: stage it through a temporary device array */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w; /* safe when w is NULL (device-memory path) */
    /* coo_pw now holds diagonal-block values first, off-diagonal values after coo_nd */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    /* no permutation was needed: v is already partitioned diagonal-first */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU; /* values were written on the device */
  PetscFunctionReturn(0);
}
63 
/* Predicate on a tuple whose second element is a (global) column index:
   true when that column lies inside the local diagonal block [cstart,cend). */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return col >= _cstart && col < _cend; /* De Morgan of !(col < cstart || col >= cend) */
  }
};
76 
/* Predicate on a (global) column index: true when the index falls outside the
   local diagonal block [cstart,cend), i.e. belongs to the off-diagonal part. */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    const bool inDiagBlock = (c >= _cstart && c < _cend);
    return !inDiagBlock;
  }
};
88 
/* Unary transform: shift a global index down by the owner's range start,
   yielding the corresponding local index. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    const PetscInt local = c - _start;
    return local;
  }
};
100 
/* MatSetPreallocationCOO implementation for MATMPIAIJCUSPARSE.
   Given n coordinates (coo_i[],coo_j[]) in global indices, this routine, on
   the GPU: partitions the entries into diagonal-block and off-diagonal-block
   groups (recording in cusp->coo_p the permutation, if one was needed, for
   later use by MatSetValuesCOO), converts indices to local numbering, builds
   the compacted off-diagonal column map b->garray, and preallocates the two
   sequential CUSPARSE blocks b->A and b->B.  Destroys any previous blocks. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0; /* number of distinct off-diagonal (ghost) columns */
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* drop any previous diagonal/off-diagonal blocks and related data */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* locate the first off-diagonal entry and, past it, the next diagonal-block
     entry; a permutation is only needed if a diagonal entry follows an
     off-diagonal one (i.e. the input is not already "diagonal first") */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* record the permutation (coo_p) and allocate a work array (coo_pw) so
       MatSetValuesCOO can reorder user-supplied values the same way */
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* #entries in the diagonal block */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* #entries in the off-diagonal block */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  /* the sorted unique global column ids become b->B's local-to-global column map */
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* translate the saved offdiag global col ids in jj[] to local (compacted)
     ids via a hash-based global-to-local mapping; n is reused as output count */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* (re)create the sequential diagonal and off-diagonal blocks */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr); /* b->B has one column per ghost id */
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr); /* builds lvec/Mvctx from b->garray */
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  /* no values have been set yet: matrix is unassembled, data nominally on the CPU */
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
202 
/* Merge the diagonal and off-diagonal blocks of A into one sequential matrix
   (via MatSeqAIJCUSPARSEMergeMats).  When glob is non-NULL, also return an IS
   with the global column indices of the merged matrix: the locally owned
   columns first, followed by the ghost columns from the column map. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Adiag,Aoff;
  const PetscInt *colmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Adiag,&Aoff,&colmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Adiag,Aoff,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cstart, nd, no, k, *idx;

    ierr = MatGetLocalSize(Adiag,NULL,&nd);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Aoff,NULL,&no);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cstart,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(nd+no,&idx);CHKERRQ(ierr);
    for (k=0; k<nd; k++) idx[k]    = cstart + k; /* owned columns: consecutive global ids */
    for (k=0; k<no; k++) idx[nd+k] = colmap[k];  /* ghost columns: global ids from the map */
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Adiag),nd+no,idx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
225 
/* MatMPIAIJSetPreallocation implementation for MATMPIAIJCUSPARSE: (re)create
   the sequential diagonal (b->A) and off-diagonal (b->B) blocks as
   MATSEQAIJCUSPARSE matrices and preallocate them.  d_nz/o_nz give a uniform
   per-row nonzero count, overridden per row by the optional d_nnz[]/o_nnz[]. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* negative per-row counts are a caller error; validated in debug builds only */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  /* discard the stale column map and communication data */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    /* on one process there is no off-process part: give b->B zero columns */
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr); /* blocks inherit B's CPU binding */
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
285 
/* yy = A*xx for MATMPIAIJCUSPARSE.  The scatter of ghost values of xx into
   a->lvec is overlapped with the diagonal-block multiply; the off-diagonal
   contribution is then added: yy = a->A*xx + a->B*lvec. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU. */
  if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);}
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
303 
/* Zero every entry of the matrix by zeroing both the diagonal and the
   off-diagonal sequential blocks. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *aij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(aij->A);CHKERRQ(ierr); /* diagonal block */
  ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); /* off-diagonal block */
  PetscFunctionReturn(0);
}
314 
/* zz = yy + A*xx for MATMPIAIJCUSPARSE.  Same structure as
   MatMult_MPIAIJCUSPARSE: the ghost scatter overlaps the diagonal-block
   multiply-add, then the off-diagonal contribution is accumulated. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  /* Consistency with MatMult_MPIAIJCUSPARSE: if A is bound to the CPU, the
     local (ghost) vector used in the multiplies should be bound to the CPU
     as well, so the CPU kernels do not operate on a GPU vector. */
  if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);}
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
330 
/* yy = A^T * xx for MATMPIAIJCUSPARSE: form the two local transpose products
   (diagonal block into yy, off-diagonal block into the ghost vector lvec),
   then accumulate the ghost contributions into yy with a reverse scatter. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,xlen);
  /* the two local products are independent of one another */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* add the off-process contributions held in lvec back into yy */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
346 
/* MatCUSPARSESetFormat implementation for MATMPIAIJCUSPARSE: record the
   requested GPU storage format for the diagonal block, the off-diagonal
   block, or both.  The formats are applied to the sequential blocks when
   they are (re)created. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed message: it previously listed MAT_CUSPARSE_MULT_DIAG twice and a
       nonexistent MAT_CUSPARSE_MULT_ALL instead of the handled enum values */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
369 
/* Process runtime options for MATMPIAIJCUSPARSE.  The storage-format options
   are only offered for non-factored matrices; each flag that was given is
   forwarded to MatCUSPARSESetFormat() with the matching operation. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg; /* set when the option was supplied on the command line */
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    /* sets both blocks at once */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
400 
/* MatAssemblyEnd for MATMPIAIJCUSPARSE: run the standard MPIAIJ assembly,
   ensure the ghost vector is a CUDA vector, and free the cached device
   "split CSR" descriptor when the nonzero structure changed (it would be
   stale otherwise). */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz = A->nonzerostate; /* nonzero state before assembly */

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    /* the descriptor itself lives on the device: copy it to the host so the
       device arrays it points to can be freed */
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) { /* index arrays are owned by the descriptor */
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) { /* off-diagonal arrays may be unset; free only when present */
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
433 
/* Destroy a MATMPIAIJCUSPARSE matrix: free the cached device "split CSR"
   descriptor (if any), release the CUSPARSE handle and the type-specific
   spptr data, detach the composed methods, then run the MPIAIJ destructor. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    /* the descriptor lives on the device: copy it to the host so the device
       arrays it points to can be freed (same teardown as MatAssemblyEnd) */
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) { /* index arrays are owned by the descriptor */
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) { /* off-diagonal arrays may be unset; free only when present */
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try { /* translate a thrown char* from the C++ teardown into a PETSc error */
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* remove the type-specific composed methods before the base-class destroy */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
486 
/* Convert a MATMPIAIJ matrix to MATMPIAIJCUSPARSE.  With MAT_INITIAL_MATRIX a
   duplicate of B is converted; with MAT_REUSE_MATRIX B's values are copied
   into the existing *newmat; otherwise (MAT_INPLACE_MATRIX, as used by
   MatCreate_MPIAIJCUSPARSE where *newmat == B) B itself is converted.  The
   routine switches the default vector type to CUDA, retypes the sequential
   blocks and ghost vector, creates the Mat_MPIAIJCUSPARSE spptr (with a
   cusparse handle) when absent, and installs the CUSPARSE-specific ops and
   composed methods. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  /* vectors created for/by this matrix default to CUDA */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  /* create the CUSPARSE-specific data only once; a reused matrix keeps its spptr */
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    cusparseStruct->deviceMat           = NULL;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
546 
/* Registered constructor for MATMPIAIJCUSPARSE: initialize the CUDA device,
   build a plain MPIAIJ matrix, then convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
557 
558 /*@
559    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
560    (the default parallel PETSc format).  This matrix will ultimately be pushed down
561    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
562    assembly performance the user should preallocate the matrix storage by setting
563    the parameter nz (or the array nnz).  By setting these parameters accurately,
564    performance during matrix assembly can be increased by more than a factor of 50.
565 
566    Collective
567 
568    Input Parameters:
569 +  comm - MPI communicator
570 .  m - number of rows
571 .  n - number of columns
572 .  nz - number of nonzeros per row (same for all rows)
573 -  nnz - array containing the number of nonzeros in the various rows
574          (possibly different for each row) or NULL
575 
576    Output Parameter:
577 .  A - the matrix
578 
579    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
580    MatXXXXSetPreallocation() paradigm instead of this routine directly.
581    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
582 
583    Notes:
584    If d_nnz (resp. o_nnz) is given then d_nz (resp. o_nz) is ignored
585 
586    The AIJ format (also called the Yale sparse matrix format or
587    compressed row storage), is fully compatible with standard Fortran 77
588    storage.  That is, the stored row and column indices can begin at
589    either one (as in Fortran) or zero.  See the users' manual for details.
590 
591    Specify the preallocated storage with either nz or nnz (not both).
592    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
593    allocation.  For large problems you MUST preallocate memory or you
594    will get TERRIBLE performance, see the users' manual chapter on matrices.
595 
596    By default, this format uses inodes (identical nodes) when possible, to
597    improve numerical efficiency of matrix-vector products and solves. We
598    search for consecutive rows with the same nonzero structure, thereby
599    reusing matrix information to achieve increased efficiency.
600 
601    Level: intermediate
602 
603 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
604 @*/
/* Create an AIJCUSPARSE matrix on comm: sequential (MATSEQAIJCUSPARSE) on a
   single process, parallel (MATMPIAIJCUSPARSE) otherwise, and apply the
   corresponding preallocation.  See the man page above for the parameters. */
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    nproc;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&nproc);CHKERRMPI(ierr);
  if (nproc == 1) {
    /* single process: sequential type, only the "diagonal" preallocation applies */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
623 
624 /*MC
625    MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.
626 
627    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
628    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
629    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
630 
631    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
632    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
633    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
634    for communicators controlling multiple processes.  It is recommended that you call both of
635    the above preallocation routines for simplicity.
636 
637    Options Database Keys:
638 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
639 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
640 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
641 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
642 
643   Level: beginner
644 
645  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
646 M*/
647 
648 /*MC
649    MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
650 
651   Level: beginner
652 
653  .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
654 M*/
655 
656 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
657 
// Get GPU pointers to a stripped down CSR view of a Mat. Works for both sequential
// (SEQAIJ) and MPI (MPIAIJ) CUSPARSE matrices.
//
// Input Parameter:
// .  A - the matrix; must be assembled, unfactored, and use MAT_CUSPARSE_CSR storage
//
// Output Parameter:
// .  B - device pointer to a PetscSplitCSRDataStructure holding raw CSR arrays for the
//        diagonal block (and, with more than one process, the off-diagonal block and a
//        colmap); set to NULL when A is a factored matrix
//
// Notes:
// The device structure is created on first use and cached (deviceMat in the matrix's
// spptr); later calls reuse it. Since the caller receives writable device pointers,
// A's offloadmask is set to PETSC_OFFLOAD_GPU before returning.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  PetscErrorCode             ierr;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL; // 32-bit CSR arrays as stored by CUSPARSE
  PetscScalar                *aa = NULL,*ba = NULL;
  Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  // get jaca
  if (size == 1) {
    PetscBool isseqaij;

    ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      if (!jaca->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    } else {
      // an MPIAIJ matrix on a one-process communicator: only its diagonal block is used
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      bi = NULL; // no off-diagonal block in the sequential case
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    if (!A->nooffprocentries && !aij->donotstash) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    if (cusparsestructA->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      if (!matstructB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        // the device structure wants full (uncompressed) row offsets: build and cache them once
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    // same treatment for the diagonal block: cache uncompressed row offsets on the GPU
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    cudaError_t                cerr;
    PetscSplitCSRDataStructure h_mat;

    // create and populate struct on host and copy to device
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscInt   *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      if (n && !aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      // colmap maps a global column to (local off-diagonal column + 1); 0 means not present
      ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (PetscInt)(ii+1); // was (int): avoid truncation with 64-bit indices
#if defined(PETSC_USE_64BIT_INDICES)
      { // CUSPARSE stores 32-bit indices; widen copies of bi/bj to PetscInt for the device struct
        int        *h_bi32, *h_bj32;
        PetscInt   *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr);
        cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
        cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];

        cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

        h_mat->offdiag.i = d_bi64;
        h_mat->offdiag.j = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE; // the widened copies belong to this struct

        ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr);
      }
#else
      // 32-bit PetscInt matches CUSPARSE storage: alias the existing device arrays
      h_mat->offdiag.i = (PetscInt*)bi;
      h_mat->offdiag.j = (PetscInt*)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      ierr = PetscFree(colmap);CHKERRQ(ierr);
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N; // global number of columns
#if defined(PETSC_USE_64BIT_INDICES)
    {
      if (sizeof(PetscInt) != 8) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"size of PetscInt = %d",(int)sizeof(PetscInt)); // fixed typo; cast size_t for %d
      // widen the diagonal block's 32-bit CSR indices to PetscInt, as done for B above
      int        *h_ai32, *h_aj32;
      PetscInt   *h_ai64, *h_aj64, *d_ai64, *d_aj64;
      ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr);
      cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
      cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];

      cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

      h_mat->diag.i = d_ai64;
      h_mat->diag.j = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr);
    }
#else
    h_mat->diag.i = (PetscInt*)ai;
    h_mat->diag.j = (PetscInt*)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU; // caller is expected to write matrix values on the device
  PetscFunctionReturn(0);
}
846