#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

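/* Device functor for thrust::for_each over zipped iterators: copies the first tuple
   element into the second; used below to scatter permuted COO values */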
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

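/*
  The _Basic routines below implement the simple COO assembly path, taken when every
  (i,j) pair is nonnegative and locally owned. At the user level the flow is the usual
  COO API (a minimal sketch; coo_i, coo_j, and v stand for application-provided arrays):

    MatSetPreallocationCOO(A,n,coo_i,coo_j);   // analyze the nonzero pattern once
    MatSetValuesCOO(A,v,INSERT_VALUES);        // set/add values repeatedly; v may be host or device memory

  which dispatches to the routines in this file for MATMPIAIJCUSPARSE matrices.
*/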
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

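/*
  Basic COO preallocation: partition the (i,j) pairs so that diagonal-block entries come
  first and off-diagonal entries last, recording the permutation in coo_p (so that
  MatSetValuesCOO_MPIAIJCUSPARSE_Basic can permute user values the same way), compact the
  off-diagonal column indices into garray, and preallocate the sequential diagonal (A)
  and off-diagonal (B) blocks directly on the GPU.
*/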
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               N,*jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal entries first and off-diagonal entries next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj);CHKERRQ(ierr);
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected IS size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory; the cusparse-specific preallocation call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

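/*
  Extended COO preallocation: used when the basic path does not apply, i.e. when some
  (i,j) pairs may be negative (to be ignored) or owned by another rank. The host-side
  analysis in MatSetPreallocationCOO_MPIAIJ produces, for each of the diagonal (A) and
  off-diagonal (B) blocks, maps for locally produced entries (suffix 1) and entries
  received from remote ranks (suffix 2): imap[] gives the destination offset in the
  block's value array, jmap[] gives CSR-style ranges into perm[], and perm[] indexes
  the source value array. Here we simply mirror those maps in device memory.
*/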
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode         ierr;
  Mat                    newmat;
  Mat_MPIAIJ             *mpiaij;
  Mat_MPIAIJCUSPARSE     *mpidev;
  PetscInt               coo_basic = 1;
  PetscMemType           mtype = PETSC_MEMTYPE_DEVICE;
  PetscInt               rstart,rend;
  cudaError_t            cerr;

  PetscFunctionBegin;
  if (coo_i) {
    ierr = PetscLayoutGetRange(mat->rmap,&rstart,&rend);CHKERRQ(ierr);
    ierr = PetscGetMemType(coo_i,&mtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = 0; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  ierr = MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_INT,MPI_LAND,PetscObjectComm((PetscObject)mat));CHKERRMPI(ierr);

  if (coo_basic) {
    ierr = MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
  } else {
    mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
    ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr);
    ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(newmat,MATMPIAIJ);CHKERRQ(ierr);
    ierr = MatSetOption(newmat,MAT_IGNORE_OFF_PROC_ENTRIES,mpiaij->donotstash);CHKERRQ(ierr); /* Inherit the two options that we respect from mat */
    ierr = MatSetOption(newmat,MAT_NO_OFF_PROC_ENTRIES,mat->nooffprocentries);CHKERRQ(ierr);
    ierr = MatSetPreallocationCOO_MPIAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
    ierr = MatConvert(newmat,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
    ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
    mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); /* mat->data was changed in MatHeaderMerge() */
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(mpiaij->A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(mpiaij->B);CHKERRQ(ierr);
    ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
    mpidev->use_extended_coo = PETSC_TRUE;

    cerr = cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar));CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

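/* Grid-stride loop kernels: each thread handles entries i, i + gridDim.x*blockDim.x, ...
   so a fixed launch configuration covers any nnz */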
__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  PetscCount        i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount  grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) buf[i] = kv[perm[i]];
}

__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
{
  PetscCount        i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount  grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) {for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];}
}

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  PetscErrorCode                 ierr;
  cudaError_t                    cerr;
  Mat_MPIAIJ                     *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE             *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                            A = mpiaij->A,B = mpiaij->B;
  PetscCount                     Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar                    *Aa,*Ba;
  PetscScalar                    *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar              *v1 = v;
  const PetscCount               *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount               *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount               *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount               *Cperm1 = mpidev->Cperm1_d;
  PetscMemType                   memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(memtype)) { /* If the user gave v[] in host memory, copy it to the device */
      cerr = cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    }

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa);CHKERRQ(ierr); /* write-only access to the matrix values */
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba);CHKERRQ(ierr);
      cerr = cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
    } else {
      ierr = MatSeqAIJCUSPARSEGetArray(A,&Aa);CHKERRQ(ierr); /* read & write access to the matrix values */
      ierr = MatSeqAIJCUSPARSEGetArray(B,&Ba);CHKERRQ(ierr);
    }

    /* Pack entries to be sent to remote ranks (skip empty kernel launches: a zero grid dimension is invalid) */
    if (mpiaij->sendlen) MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);

    /* Send remote entries to their owners and overlap the communication with local computation */
    ierr = PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE);CHKERRQ(ierr);
    /* Add local entries to A and B */
    if (Annz1) MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
    if (Bnnz1) MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
    ierr = PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE);CHKERRQ(ierr);

    /* Add received remote entries to A and B */
    if (Annz2) MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
    if (Bnnz2) MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba);CHKERRQ(ierr);
    } else {
      ierr = MatSeqAIJCUSPARSERestoreArray(A,&Aa);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArray(B,&Ba);CHKERRQ(ierr);
    }
    if (PetscMemTypeHost(memtype)) {cerr = cudaFree((void*)v1);CHKERRCUDA(cerr);}
  } else {
    ierr = MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

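/* Merge the diagonal and off-diagonal blocks into a single sequential matrix; if requested,
   also return the global column indices of the merged matrix (owned columns first, then the
   ghost columns recorded in cmap) */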
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because B will have been resized, we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU. */
  if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);}
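  /* Overlap communication with computation: start gathering the ghost values of xx into
     lvec, run the diagonal-block product while the scatter is in flight, then finish the
     scatter and add the off-diagonal contribution */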
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz = A->nonzerostate;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  PetscCheckFalse(!cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    /* Extended COO */
    if (cusparseStruct->use_extended_coo) {
      cerr = cudaFree(cusparseStruct->Aimap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Ajmap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bimap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bjmap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aimap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Ajmap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aperm2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bimap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bjmap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bperm2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Cperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->sendbuf_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->recvbuf_d);CHKERRCUDA(cerr);
    } else {
      delete cusparseStruct->coo_p;
      delete cusparseStruct->coo_pw;
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE;
    stat     = cusparseCreate(&(cusp->handle));CHKERRCUSPARSE(stat);
    a->spptr = cusp;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations.  For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in each row of the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in each row of the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either the nz or the nnz parameters (not both).
   Set d_nz = o_nz = PETSC_DEFAULT and d_nnz = o_nnz = NULL for PETSc to control dynamic
   memory allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

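   Example (a minimal sketch; the preallocation counts are application dependent):
.vb
   Mat A;
   ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,m,n,PETSC_DETERMINE,PETSC_DETERMINE,5,NULL,2,NULL,&A);CHKERRQ(ierr);
   /* insert entries with MatSetValues(), then assemble */
   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
.ve
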
   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

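   For example, a run that stores both blocks in ELL format might use (a hypothetical
   command line; the executable name is a placeholder):
.vb
   ./app -mat_type aijcusparse -mat_cusparse_storage_format ell
.ve
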
  Level: beginner

 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.

  Level: beginner

 .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
M*/

// get GPU pointers to stripped down Mat. For both seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  PetscErrorCode             ierr;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
  PetscScalar                *aa = NULL,*ba = NULL;
  Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  // get jaca
  if (size == 1) {
    PetscBool isseqaij;

    ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      PetscCheckFalse(!jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      bi = NULL;
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      PetscCheckFalse(!matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheckFalse(!matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

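  /* First call: build a host-side PetscSplitCSRDataStructure that records the raw device
     pointers of the diag/offdiag CSR blocks (plus a colmap for the MPI case), then copy
     the struct itself to the device so assembly kernels can dereference it directly */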
  if (!d_mat) {
    cudaError_t                cerr;
    PetscSplitCSRDataStructure h_mat;

    // create and populate the struct on the host, then copy it to the device
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscInt   *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to widen the 32-bit cuSPARSE indices to 64-bit PetscInt
        int        *h_bi32, *h_bj32;
        PetscInt   *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr);
        cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
        cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];

        cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

        h_mat->offdiag.i = d_bi64;
        h_mat->offdiag.j = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE;

        ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr);
      }
#else
      h_mat->offdiag.i = (PetscInt*)bi;
      h_mat->offdiag.j = (PetscInt*)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      ierr = PetscFree(colmap);CHKERRQ(ierr);
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    {
      PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"sizeof(PetscInt) = %d",(int)sizeof(PetscInt));
      int        *h_ai32, *h_aj32;
      PetscInt   *h_ai64, *h_aj64, *d_ai64, *d_aj64;
      ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr);
      cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
      cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];

      cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

      h_mat->diag.i = d_ai64;
      h_mat->diag.j = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr);
    }
#else
    h_mat->diag.i = (PetscInt*)ai;
    h_mat->diag.j = (PetscInt*)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to the device
    cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}