#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
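
/* Illustrative sketch (kept in a comment, not compiled) of the pattern VecCUDAEquals
   enables below: zipping a permutation iterator over a source range with a destination
   range turns thrust::for_each into a gathered device-to-device copy. The names src,
   perm and dst are hypothetical.

     auto first = thrust::make_zip_iterator(thrust::make_tuple(
                    thrust::make_permutation_iterator(src.begin(),perm.begin()),
                    dst.begin()));
     auto last  = thrust::make_zip_iterator(thrust::make_tuple(
                    thrust::make_permutation_iterator(src.begin(),perm.end()),
                    dst.end()));
     thrust::for_each(first,last,VecCUDAEquals()); // dst[k] = src[perm[k]]
*/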

/* Basic COO path: every entry is locally owned; coo_p (if present) is the permutation
   that reorders the user's values into diagonal-block-first order, with coo_pw as the
   device work array holding the permuted values */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
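
/* Hedged usage sketch of the public COO assembly API that eventually reaches the
   routine above; MatSetPreallocationCOO() and MatSetValuesCOO() are the documented
   PETSc entry points, and the 2x2 data below is made up. Assumes A already has its
   sizes set and type MATMPIAIJCUSPARSE.

     PetscInt    coo_i[] = {0,0,1,1};
     PetscInt    coo_j[] = {0,1,0,1};
     PetscScalar coo_v[] = {1.0,2.0,3.0,4.0};

     ierr = MatSetPreallocationCOO(A,4,coo_i,coo_j);CHKERRQ(ierr); // analyze the pattern once
     ierr = MatSetValuesCOO(A,coo_v,INSERT_VALUES);CHKERRQ(ierr);  // repeatable, runs on the GPU
*/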

template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};
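
/* A minimal sketch (commented out, names hypothetical) of how the three functors above
   combine: IsOffDiag selects columns outside the owned range [cstart,cend), GlobToLoc
   shifts owned indices to local numbering, and IsNotOffDiagT applies the same test to an
   (i,j,perm) tuple so one thrust::partition can reorder all three arrays at once. For a
   device vector d_j of global column indices (needs <thrust/count.h>):

     PetscInt noff = (PetscInt)thrust::count_if(thrust::device,d_j.begin(),d_j.end(),
                                                IsOffDiag(cstart,cend)); // off-diagonal entries
     thrust::transform(thrust::device,d_j.begin(),d_j.end()-noff,
                       d_j.begin(),GlobToLoc(cstart));                   // global -> local
*/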

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               N,*jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj);CHKERRQ(ierr);
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected IS size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode         ierr;
  Mat                    newmat;
  Mat_MPIAIJ             *mpiaij;
  Mat_MPIAIJCUSPARSE     *mpidev;
  PetscInt               coo_basic = 1;
  PetscMemType           mtype = PETSC_MEMTYPE_DEVICE;
  PetscInt               rstart,rend;
  cudaError_t            cerr;

  PetscFunctionBegin;
  if (coo_i) {
    ierr = PetscLayoutGetRange(mat->rmap,&rstart,&rend);CHKERRQ(ierr);
    ierr = PetscGetMemType(coo_i,&mtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = 0; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  ierr = MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_INT,MPI_LAND,PetscObjectComm((PetscObject)mat));CHKERRMPI(ierr);

  if (coo_basic) {
    ierr = MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
  } else {
    mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
    ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr);
    ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(newmat,MATMPIAIJ);CHKERRQ(ierr);
    ierr = MatSetOption(newmat,MAT_IGNORE_OFF_PROC_ENTRIES,mpiaij->donotstash);CHKERRQ(ierr); /* Inherit the two options that we respect from mat */
    ierr = MatSetOption(newmat,MAT_NO_OFF_PROC_ENTRIES,mat->nooffprocentries);CHKERRQ(ierr);
    ierr = MatSetPreallocationCOO_MPIAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
    ierr = MatConvert(newmat,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
    ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
    ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
    mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); /* mat->data was changed in MatHeaderMerge() */
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    cerr = cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar));CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  PetscCount        i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount  grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) buf[i] = kv[perm[i]];
}

__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
{
  PetscCount        i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount  grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) {for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];}
}
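
/* Both kernels use a grid-stride loop, so any positive grid size produces a correct
   result; the launches below size the grid as ceil(nnz/256) blocks of 256 threads.
   A hedged sketch of the convention, with hypothetical names n, v, perm and buf:

     const PetscCount threads = 256;
     const PetscCount blocks  = (n + threads - 1) / threads;      // ceil(n/threads)
     if (n) MatPackCOOValues<<<blocks,threads>>>(v,n,perm,buf);   // skip n == 0, a zero-sized grid is invalid
*/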

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  PetscErrorCode                 ierr;
  cudaError_t                    cerr;
  Mat_MPIAIJ                     *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE             *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                            A = mpiaij->A,B = mpiaij->B;
  PetscCount                     Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar                    *Aa,*Ba;
  PetscScalar                    *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar              *v1 = v;
  const PetscCount               *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount               *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount               *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount               *Cperm1 = mpidev->Cperm1_d;
  PetscMemType                   memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(memtype)) { /* If the user passed v[] on the host, copy it to the device */
      cerr = cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (v) {cerr = cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);}
      else {cerr = cudaMemset((void*)v1,0,mpiaij->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr);}
    }

    if (imode == INSERT_VALUES) {
      ierr = MatZeroEntries(A);CHKERRQ(ierr);
      ierr = MatZeroEntries(B);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa);CHKERRQ(ierr); /* write matrix values */
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba);CHKERRQ(ierr);
    } else {
      ierr = MatSeqAIJCUSPARSEGetArray(A,&Aa);CHKERRQ(ierr); /* read & write matrix values */
      ierr = MatSeqAIJCUSPARSEGetArray(B,&Ba);CHKERRQ(ierr);
    }

    /* Pack entries to be sent to remote ranks; guard against empty launches since a zero-sized grid is an invalid CUDA configuration */
    if (mpiaij->sendlen) MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);

    /* Send remote entries to their owner and overlap the communication with local computation */
    ierr = PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE);CHKERRQ(ierr);
    /* Add local entries to A and B */
    if (Annz1) MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
    if (Bnnz1) MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
    ierr = PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE);CHKERRQ(ierr);

    /* Add received remote entries to A and B */
    if (Annz2) MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
    if (Bnnz2) MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba);CHKERRQ(ierr);
    } else {
      ierr = MatSeqAIJCUSPARSERestoreArray(A,&Aa);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArray(B,&Ba);CHKERRQ(ierr);
    }
    if (PetscMemTypeHost(memtype)) {cerr = cudaFree((void*)v1);CHKERRCUDA(cerr);}
  } else {
    ierr = MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
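
/* Hedged example of the public entry point this implements: MatMPIAIJGetLocalMatMerge()
   returns the merged local matrix [Ad Ao] together with an IS mapping its columns back
   to global indices.

     Mat A_loc;
     IS  glob;
     ierr = MatMPIAIJGetLocalMatMerge(A,MAT_INITIAL_MATRIX,&glob,&A_loc);CHKERRQ(ierr);
     // ... local column k of A_loc corresponds to global column glob[k] ...
     ierr = ISDestroy(&glob);CHKERRQ(ierr);
     ierr = MatDestroy(&A_loc);CHKERRQ(ierr);
*/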

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU. */
  if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);}
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
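
/* Sketch of a typical call chain that reaches MatMult_MPIAIJCUSPARSE: with a
   MATMPIAIJCUSPARSE matrix, vectors created from it default to VECCUDA, so the
   scatter and both local products run on the GPU.

     Vec x,y;
     ierr = MatCreateVecs(A,&x,&y);CHKERRQ(ierr); // vectors of type VECCUDA here
     ierr = MatMult(A,x,y);CHKERRQ(ierr);         // y = A*x
*/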

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
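
/* Hedged usage sketch: pick different cuSPARSE storage formats for the diagonal
   and off-diagonal blocks, or one format for both, via the public API.

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_ELL);CHKERRQ(ierr);
     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,MAT_CUSPARSE_HYB);CHKERRQ(ierr);
     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_CSR);CHKERRQ(ierr);
*/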

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz = A->nonzerostate;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  PetscCheckFalse(!cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    /* Extended COO */
    if (cusparseStruct->use_extended_coo) {
      cerr = cudaFree(cusparseStruct->Aimap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Ajmap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bimap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bjmap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aimap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Ajmap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aperm2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bimap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bjmap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bperm2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Cperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->sendbuf_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->recvbuf_d);CHKERRCUDA(cerr);
    } else {
      delete cusparseStruct->coo_p;
      delete cusparseStruct->coo_pw;
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE;
    stat     = cusparseCreate(&(cusp->handle));CHKERRCUSPARSE(stat);
    a->spptr = cusp;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). The matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters
   accurately, performance during matrix assembly can be increased by more than a
   factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
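
/* Hedged example of the recommended creation paradigm mentioned in the manual page
   above, equivalent to calling MatCreateAIJCUSPARSE() directly; the preallocation
   call that does not match the actual type is ignored, so both may be issued.

     ierr = MatCreate(comm,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,M,N);CHKERRQ(ierr);
     ierr = MatSetType(A,MATAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatMPIAIJSetPreallocation(A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
     ierr = MatSeqAIJSetPreallocation(A,d_nz,d_nnz);CHKERRQ(ierr);
*/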

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.

  Level: beginner

 .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
M*/

// get GPU pointers to stripped down Mat. For both seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  PetscErrorCode             ierr;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
  PetscScalar                *aa = NULL,*ba = NULL;
  Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  // get jaca
  if (size == 1) {
    PetscBool isseqaij;

    ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      PetscCheckFalse(!jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      bi = NULL;
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      PetscCheckFalse(!matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheckFalse(!matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    cudaError_t                cerr;
    PetscSplitCSRDataStructure h_mat;

    // create and populate the struct on the host, then copy it to the device
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscInt   *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (PetscInt)(ii+1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make a long version of these
        int        *h_bi32, *h_bj32;
        PetscInt   *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr);
        cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
        cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];

        cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

        h_mat->offdiag.i = d_bi64;
        h_mat->offdiag.j = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE;

        ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr);
      }
#else
      h_mat->offdiag.i = (PetscInt*)bi;
      h_mat->offdiag.j = (PetscInt*)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      ierr = PetscFree(colmap);CHKERRQ(ierr);
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    {
      PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size of PetscInt = %d",(int)sizeof(PetscInt));
      int        *h_ai32, *h_aj32;
      PetscInt   *h_ai64, *h_aj64, *d_ai64, *d_aj64;
      ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr);
      cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
      cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];

      cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

      h_mat->diag.i = d_ai64;
      h_mat->diag.j = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr);
    }
#else
    h_mat->diag.i = (PetscInt*)ai;
    h_mat->diag.j = (PetscInt*)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}