/* petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision c87e60e88412cb95f0de7b35b2cfeb2d36afaa13) */
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

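/* Copy functor used with thrust::for_each() over a zip iterator: assigns the
   first tuple element (a gathered source value) to the second (the destination) */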
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

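/* Device functors used below to classify COO entries: a column index c belongs
   to the diagonal block when cstart <= c < cend and to the off-diagonal block
   otherwise; GlobToLoc shifts a global index to a local one */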
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

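/* Basic COO preallocation: partition the (i,j) pairs on the device so that
   diagonal-block entries precede off-diagonal ones (recording the permutation
   in coo_p), compact the off-diagonal column ids into b->garray, and
   preallocate the sequential blocks b->A and b->B from the two subranges */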
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               N,*jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal entries first and off-diagonal entries next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj);CHKERRQ(ierr);
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected IS size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory; the cuSPARSE-specific call handles the allocation internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

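/* Dispatch for MatSetPreallocationCOO() on MATMPIAIJCUSPARSE: the basic path
   above handles the case where every rank owns all of its row indices; the
   extended path builds the assembly maps on the CPU with
   MatSetPreallocationCOO_MPIAIJ() and mirrors them on the device so that
   MatSetValuesCOO_MPIAIJCUSPARSE() can handle remote and repeated entries */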
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode         ierr;
  Mat                    newmat;
  Mat_MPIAIJ             *mpiaij;
  Mat_MPIAIJCUSPARSE     *mpidev;
  PetscInt               coo_basic = 1;
  PetscMemType           mtype = PETSC_MEMTYPE_DEVICE;
  PetscInt               rstart,rend;
  cudaError_t            cerr;

  PetscFunctionBegin;
  if (coo_i) {
    ierr = PetscLayoutGetRange(mat->rmap,&rstart,&rend);CHKERRQ(ierr);
    ierr = PetscGetMemType(coo_i,&mtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = 0; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  ierr = MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_INT,MPI_LAND,PetscObjectComm((PetscObject)mat));CHKERRMPI(ierr);

  if (coo_basic) {
    ierr = MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
  } else {
    mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
    ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr);
    ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(newmat,MATMPIAIJ);CHKERRQ(ierr);
    ierr = MatSetOption(newmat,MAT_IGNORE_OFF_PROC_ENTRIES,mpiaij->donotstash);CHKERRQ(ierr); /* Inherit the two options that we respect from mat */
    ierr = MatSetOption(newmat,MAT_NO_OFF_PROC_ENTRIES,mat->nooffprocentries);CHKERRQ(ierr);
    ierr = MatSetPreallocationCOO_MPIAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
    ierr = MatConvert(newmat,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
    ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
    ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
    mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); /* mat->data was changed in MatHeaderMerge() */
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    cerr = cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar));CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

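/* Grid-stride loop kernels: each thread starts at its global index and strides
   by gridDim.x*blockDim.x, so any launch configuration covers all nnz entries */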
__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  PetscCount        i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount  grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) buf[i] = kv[perm[i]];
}

__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
{
  PetscCount        i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount  grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) {for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];}
}

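/* Typical use of the COO assembly interface implemented below (a minimal
   sketch; ncoo, coo_i, coo_j and coo_v stand for user-provided data):

     ierr = MatSetPreallocationCOO(A,ncoo,coo_i,coo_j);CHKERRQ(ierr);
     ierr = MatSetValuesCOO(A,coo_v,ADD_VALUES);CHKERRQ(ierr);

   The values array may reside on the host or on the device; its memory type
   is queried with PetscGetMemType() before use */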
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  PetscErrorCode                 ierr;
  cudaError_t                    cerr;
  Mat_MPIAIJ                     *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE             *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                            A = mpiaij->A,B = mpiaij->B;
  PetscCount                     Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar                    *Aa,*Ba;
  PetscScalar                    *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar              *v1 = v;
  const PetscCount               *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount               *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount               *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount               *Cperm1 = mpidev->Cperm1_d;
  PetscMemType                   memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on the host, we need to copy it to the device */
      cerr = cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    }

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa);CHKERRQ(ierr); /* write matrix values */
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba);CHKERRQ(ierr);
      cerr = cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
    } else {
      ierr = MatSeqAIJCUSPARSEGetArray(A,&Aa);CHKERRQ(ierr); /* read & write matrix values */
      ierr = MatSeqAIJCUSPARSEGetArray(B,&Ba);CHKERRQ(ierr);
    }

    /* Pack entries to be sent to remote processes */
    MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);

    /* Send remote entries to their owners and overlap the communication with local computation */
    ierr = PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE);CHKERRQ(ierr);
    /* Add local entries to A and B */
    MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
    MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
    ierr = PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE);CHKERRQ(ierr);

    /* Add received remote entries to A and B */
    MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
    MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba);CHKERRQ(ierr);
    } else {
      ierr = MatSeqAIJCUSPARSERestoreArray(A,&Aa);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArray(B,&Ba);CHKERRQ(ierr);
    }
    if (PetscMemTypeHost(memtype)) {cerr = cudaFree((void*)v1);CHKERRCUDA(cerr);}
  } else {
    ierr = MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because b->B may have been resized, we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

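/* MatMult overlaps communication with computation: the scatter of halo values
   into a->lvec is started, the diagonal block is applied while it is in
   flight, and the off-diagonal block is applied after the scatter completes */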
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU. */
  if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);}
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz = A->nonzerostate;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  PetscCheckFalse(!cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    /* Extended COO */
    if (cusparseStruct->use_extended_coo) {
      cerr = cudaFree(cusparseStruct->Aimap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Ajmap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bimap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bjmap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aimap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Ajmap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aperm2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bimap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bjmap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bperm2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Cperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->sendbuf_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->recvbuf_d);CHKERRCUDA(cerr);
    } else {
      delete cusparseStruct->coo_p;
      delete cusparseStruct->coo_pw;
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE;
    stat     = cusparseCreate(&(cusp->handle));CHKERRCUSPARSE(stat);
    a->spptr = cusp;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the cuSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm  - MPI communicator
.  m     - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n     - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M     - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N     - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz  - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in each row of the diagonal block
           (possibly different for each row) or NULL
.  o_nz  - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in each row of the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL (and likewise for the off-diagonal block) for PETSc
   to control dynamic memory allocation.  For large problems you MUST preallocate memory
   or you will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

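   Example (a minimal sketch; the global sizes M, N and the preallocation values are placeholders):
.vb
   Mat A;
   ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,M,N,5,NULL,2,NULL,&A);CHKERRQ(ierr);
   /* set values with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd() */
   ierr = MatDestroy(&A);CHKERRQ(ierr);
.ve
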
   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

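   For example, running with "-mat_type mpiaijcusparse -mat_cusparse_storage_format ell" selects the ELL format for both the diagonal and off-diagonal blocks.
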
  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.

  Level: beginner

.seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
M*/

// get GPU pointers to a stripped-down Mat; works for both seq and MPI Mat
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  PetscErrorCode             ierr;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
  PetscScalar                *aa = NULL,*ba = NULL;
  Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  // get jaca
  if (size == 1) {
    PetscBool isseqaij;

    ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      PetscCheckFalse(!jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      bi = NULL;
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      PetscCheckFalse(!matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheckFalse(!matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    cudaError_t                cerr;
    PetscSplitCSRDataStructure h_mat;

    // create and populate the struct on the host, then copy it to the device
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscInt   *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
#if defined(PETSC_USE_64BIT_INDICES)
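      /* cuSPARSE stores row offsets and column indices as 32-bit ints; with
         64-bit PetscInt the indices are copied to the host, widened to 64 bits,
         and uploaded to the device again */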
      { // have to make a long version of these
        int        *h_bi32, *h_bj32;
        PetscInt   *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr);
        cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
        cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];

        cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

        h_mat->offdiag.i = d_bi64;
        h_mat->offdiag.j = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE;

        ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr);
      }
#else
      h_mat->offdiag.i = (PetscInt*)bi;
      h_mat->offdiag.j = (PetscInt*)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      ierr = PetscFree(colmap);CHKERRQ(ierr);
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    {
      PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size of PetscInt = %d",(int)sizeof(PetscInt));
      int        *h_ai32, *h_aj32;
      PetscInt   *h_ai64, *h_aj64, *d_ai64, *d_aj64;
      ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr);
      cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
      cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];

      cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

      h_mat->diag.i = d_ai64;
      h_mat->diag.j = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr);
    }
#else
    h_mat->diag.i = (PetscInt*)ai;
    h_mat->diag.j = (PetscInt*)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to the device
    cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}