xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 3c859ba3a04a72e8efcb87bc7ffc046a6cbab413)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
3 
4 #include <petscconf.h>
5 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
6 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
7 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
8 #include <thrust/advance.h>
9 #include <thrust/partition.h>
10 #include <thrust/sort.h>
11 #include <thrust/unique.h>
12 #include <petscsf.h>
13 
/* Functor for thrust::for_each over a zip iterator: assigns the first
   tuple component to the second (element-wise copy on host or device). */
struct VecCUDAEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped);
  }
};
23 
/* MatResetPreallocationCOO_MPIAIJCUSPARSE - release all device-side state built
   for COO assembly: the extended-COO device maps/buffers (when in use) and the
   basic-path permutation arrays coo_p/coo_pw. Safe to call when spptr is NULL. */
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij  = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  if (cusp->use_extended_coo) {
    /* free every device array allocated for the extended COO path, in the same
       order they were allocated */
    void *devbufs[] = {cusp->Aimap1_d,cusp->Ajmap1_d,cusp->Aperm1_d,
                       cusp->Bimap1_d,cusp->Bjmap1_d,cusp->Bperm1_d,
                       cusp->Aimap2_d,cusp->Ajmap2_d,cusp->Aperm2_d,
                       cusp->Bimap2_d,cusp->Bjmap2_d,cusp->Bperm2_d,
                       cusp->Cperm1_d,cusp->sendbuf_d,cusp->recvbuf_d};
    for (size_t k=0; k<sizeof(devbufs)/sizeof(devbufs[0]); k++) {
      cerr = cudaFree(devbufs[k]);CHKERRCUDA(cerr);
    }
  }
  cusp->use_extended_coo = PETSC_FALSE;
  delete cusp->coo_p;   /* deleting NULL is a no-op */
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  PetscFunctionReturn(0);
}
56 
/* MatSetValuesCOO_MPIAIJCUSPARSE_Basic - insert/add COO values v[] into the
   diagonal (a->A) and off-diagonal (a->B) sequential blocks.

   If a permutation cusp->coo_p was recorded at preallocation time, v[] is first
   permuted on the device into the work array cusp->coo_pw so the diagonal entries
   (first coo_nd of them) precede the off-diagonal ones; otherwise v[] is assumed
   to already be in that order and is passed straight through. v may live on host
   or device (detected with isCudaMem); host data is staged into a temporary
   device vector. v == NULL is forwarded unchanged to the Seq routines. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total number of COO entries */
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL; /* temporary device copy of v when v is on host */

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* zip (permuted view of v) with coo_pw so for_each copies v[coo_p[k]] -> coo_pw[k] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    /* diagonal values first, off-diagonal values start at offset coo_nd */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
94 
/* Predicate used with thrust::partition: true when the column index (the second
   tuple component) falls inside the local diagonal block [cstart, cend). */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return col >= _cstart && col < _cend; /* De Morgan of !(col < cstart || col >= cend) */
  }
};
107 
/* Predicate used with thrust::find_if: true when a column index lies outside
   the local diagonal block [cstart, cend), i.e. belongs to the off-diagonal part. */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &col)
  {
    return !(col >= _cstart && col < _cend);
  }
};
119 
/* Unary functor for thrust::transform: converts a global index to a local one
   by subtracting the ownership-range start. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &gid)
  {
    const PetscInt lid = gid - _start;
    return lid;
  }
};
131 
/* MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic - build the diagonal (b->A) and
   off-diagonal (b->B) SEQAIJCUSPARSE blocks from local COO indices coo_i[]/coo_j[].

   The (i,j) pairs are copied to the device and arranged so that entries whose
   column lies in [B->cmap->rstart, B->cmap->rend) (diagonal block) come first.
   When the input is not already in that order, thrust::partition reorders it and
   the applied permutation is recorded in cusp->coo_p (with work array cusp->coo_pw)
   so MatSetValuesCOO_MPIAIJCUSPARSE_Basic can reorder v[] identically later.
   Off-diagonal column ids are compacted (sort + unique) into b->garray, and the
   off-diagonal j indices are remapped to the compacted local numbering via a
   hash-based local-to-global mapping. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               N,*jj;
  size_t                 noff = 0; /* number of distinct off-diagonal columns */
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  /* the sub-blocks are rebuilt from scratch below */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* find the first off-diagonal column, then the first diagonal column after it;
     if both exist the input is interleaved and must be partitioned */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    /* coo_p starts as the identity; partitioning the zipped (i,j,p) triples
       turns it into the permutation that was applied */
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # off-diagonal entries */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* map the saved global offdiag columns in jj[] to the compacted [0,noff) numbering */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj);CHKERRQ(ierr);
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* recreate the sequential sub-blocks: A is n x n(local cols), B is n x noff */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
221 
/* MatSetPreallocationCOO_MPIAIJCUSPARSE - COO preallocation dispatcher.

   Chooses between two code paths, agreed upon collectively (MPI_Allreduce):
   - "basic": every rank's host indices are non-negative and row-local, so the
     simpler MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic path applies (also taken
     when coo_i is NULL or lives in device memory — the host scan is skipped);
   - "extended": falls back to the general MatSetPreallocationCOO_MPIAIJ host
     setup, then mirrors all of its host-side maps (Aimap/Ajmap/Aperm, B*,
     Cperm1, send/recv buffers) into device memory for use by
     MatSetValuesCOO_MPIAIJCUSPARSE. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode         ierr;
  Mat_MPIAIJ             *mpiaij = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE     *mpidev;
  PetscBool              coo_basic = PETSC_TRUE;
  PetscMemType           mtype = PETSC_MEMTYPE_DEVICE;
  PetscInt               rstart,rend;
  cudaError_t            cerr;

  PetscFunctionBegin;
  /* wipe any previous assembly/COO state before re-preallocating */
  ierr = PetscFree(mpiaij->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&mpiaij->lvec);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&mpiaij->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(mpiaij->colmap);CHKERRQ(ierr);
#endif
  ierr = VecScatterDestroy(&mpiaij->Mvctx);CHKERRQ(ierr);
  mat->assembled = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  ierr = MatResetPreallocationCOO_MPIAIJ(mat);CHKERRQ(ierr);
  ierr = MatResetPreallocationCOO_MPIAIJCUSPARSE(mat);CHKERRQ(ierr);
  if (coo_i) {
    ierr = PetscLayoutGetRange(mat->rmap,&rstart,&rend);CHKERRQ(ierr);
    ierr = PetscGetMemType(coo_i,&mtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  ierr = MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)mat));CHKERRMPI(ierr);
  if (coo_basic) {
    ierr = MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
  } else {
    ierr = MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    ierr = MatSeqAIJCUSPARSECopyToGPU(mpiaij->A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(mpiaij->B);CHKERRQ(ierr);
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    /* allocate device mirrors of the host COO maps built by MatSetPreallocationCOO_MPIAIJ;
       the *jmap* arrays have one extra entry (CSR-like row-offset convention) */
    cerr = cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    /* communication buffers for off-rank COO entries */
    cerr = cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar));CHKERRCUDA(cerr);

    /* upload the host maps */
    cerr = cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
307 
/* Kernel: gather kv[] through perm[] into the contiguous buffer buf[]
   (buf[i] = kv[perm[i]] for 0 <= i < nnz). 1-D launch; the grid-stride
   loop makes any grid size valid. */
__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  const PetscCount stride = gridDim.x*blockDim.x;
  for (PetscCount tid = blockIdx.x*blockDim.x + threadIdx.x; tid < nnz; tid += stride) buf[tid] = kv[perm[tid]];
}
314 
/* Kernel: accumulate COO values into matrix storage. For each nonzero i,
   the entries kv[perm[k]] with jmap[i] <= k < jmap[i+1] are added one by one
   into a[imap[i]] (per-term addition kept to preserve rounding). 1-D launch
   with a grid-stride loop. */
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
{
  const PetscCount stride = gridDim.x*blockDim.x;
  for (PetscCount tid = blockIdx.x*blockDim.x + threadIdx.x; tid < nnz; tid += stride) {
    for (PetscCount k = jmap[tid]; k < jmap[tid+1]; k++) a[imap[tid]] += kv[perm[k]];
  }
}
321 
/* MatSetValuesCOO_MPIAIJCUSPARSE - set/add COO values on the GPU.

   Extended-COO path: stages host v[] to device if needed, packs entries destined
   for other ranks (MatPackCOOValues via Cperm1) into sendbuf, starts the SF
   reduce, adds the locally-owned entries (the *1 maps) to A and B while the
   communication is in flight, completes the reduce, then adds the received
   remote entries (the *2 maps). INSERT_VALUES zeroes the matrix value arrays
   first so the kernels' "+=" behaves as an insert. When use_extended_coo is
   false, delegates to MatSetValuesCOO_MPIAIJCUSPARSE_Basic. Either way the
   matrix is left with offloadmask = PETSC_OFFLOAD_GPU. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  PetscErrorCode                 ierr;
  cudaError_t                    cerr;
  Mat_MPIAIJ                     *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE             *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                            A = mpiaij->A,B = mpiaij->B;
  PetscCount                     Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar                    *Aa,*Ba = NULL;
  PetscScalar                    *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar              *v1 = v; /* device view of v[] (may be a staged copy) */
  const PetscCount               *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount               *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount               *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount               *Cperm1 = mpidev->Cperm1_d;
  PetscMemType                   memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);CHKERRMPI(ierr);
    ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      cerr = cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    }

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa);CHKERRQ(ierr); /* write matrix values */
      cerr = cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (size > 1) {
        ierr = MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba);CHKERRQ(ierr);
        cerr = cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
      }
    } else {
      ierr = MatSeqAIJCUSPARSEGetArray(A,&Aa);CHKERRQ(ierr); /* read & write matrix values */
      if (size > 1) { ierr = MatSeqAIJCUSPARSEGetArray(B,&Ba);CHKERRQ(ierr); }
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);
      CHKERRCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    if (size > 1) { ierr = PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE);CHKERRQ(ierr); }
    /* Add local entries to A and B */
    if (Annz1) {
      MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
      CHKERRCUDA(cudaPeekAtLastError());
    }
    if (Bnnz1) {
      MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
      CHKERRCUDA(cudaPeekAtLastError());
    }
    if (size > 1) { ierr = PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE);CHKERRQ(ierr); }

    /* Add received remote entries to A and B */
    if (Annz2) {
      MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
      CHKERRCUDA(cudaPeekAtLastError());
    }
    if (Bnnz2) {
      MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);
      CHKERRCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);CHKERRQ(ierr);
      if (size > 1) { ierr = MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba);CHKERRQ(ierr); }
    } else {
      ierr = MatSeqAIJCUSPARSERestoreArray(A,&Aa);CHKERRQ(ierr);
      if (size > 1) { ierr = MatSeqAIJCUSPARSERestoreArray(B,&Ba);CHKERRQ(ierr); }
    }
    /* free the staged device copy of v[], if one was made */
    if (PetscMemTypeHost(memtype)) {cerr = cudaFree((void*)v1);CHKERRCUDA(cerr);}
  } else {
    ierr = MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode);CHKERRQ(ierr);
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
405 
/* MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE - merge the diagonal and off-diagonal
   blocks into one local sequential matrix; optionally return (in glob) the global
   column indices of the merged matrix: the owned column range first, followed by
   the off-diagonal column map. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst,k,dn,on,*gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    /* owned columns are contiguous starting at cst; off-diag columns come from cmap */
    for (k=0; k<dn; k++) gidx[k]    = cst + k;
    for (k=0; k<on; k++) gidx[dn+k] = cmap[k];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
428 
/* MatMPIAIJSetPreallocation_MPIAIJCUSPARSE - classical (nnz-based) preallocation.

   Validates the per-row nonzero counts in debug builds, tears down state that
   depends on the old sparsity pattern (colmap, garray, lvec, Mvctx), recreates
   the sub-blocks as SEQAIJCUSPARSE, preallocates them, and propagates the
   cuSPARSE format/handle settings from the parallel matrix.

   d_nz/d_nnz - diagonal-block nonzeros (uniform count or per-row array)
   o_nz/o_nnz - off-diagonal-block nonzeros (uniform count or per-row array) */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* debug-only sanity checks on the user-provided counts */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
  /* pattern-dependent state is invalid after re-preallocation */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    /* off-diagonal block is empty (0 columns) on a single rank */
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
484 
/* MatMult_MPIAIJCUSPARSE - yy = A*xx. The halo exchange of xx into lvec is
   overlapped with the diagonal-block product; the off-diagonal contribution is
   then accumulated with multadd. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
497 
/* MatZeroEntries_MPIAIJCUSPARSE - zero both sequential sub-blocks. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(mpiaij->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(mpiaij->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
508 
/* MatMultAdd_MPIAIJCUSPARSE - zz = yy + A*xx, overlapping the halo exchange of
   xx with the diagonal-block multadd. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
521 
/* MatMultTranspose_MPIAIJCUSPARSE - yy = A^T*xx. B^T*xx is computed into lvec,
   then scattered in reverse with ADD_VALUES on top of the local A^T*xx result. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
534 
/* MatCUSPARSESetFormat_MPIAIJCUSPARSE - record the cuSPARSE storage format to use
   for SpMV on the diagonal block, the off-diagonal block, or both.

   op     - MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, or MAT_CUSPARSE_ALL
   format - MatCUSPARSEStorageFormat applied to the selected block(s)

   Raises PETSC_ERR_SUP for any other operation. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed error text: it previously listed MAT_CUSPARSE_MULT_DIAG twice and a
       nonexistent MAT_CUSPARSE_MULT_ALL instead of the values handled above */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
557 
/* MatSetFromOptions_MPIAIJCUSPARSE - process runtime options controlling the
   cuSPARSE storage format of the diagonal block, the off-diagonal block, or both
   (skipped for factored matrices). Options:
     -mat_cusparse_mult_diag_storage_format
     -mat_cusparse_mult_offdiag_storage_format
     -mat_cusparse_storage_format */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    /* the combined option applies the same format to both blocks */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
588 
/* MatAssemblyEnd_MPIAIJCUSPARSE - finish assembly, force lvec to be a CUDA
   vector, and, if the nonzero pattern changed while a device-side split-CSR
   mirror (cusp->deviceMat) exists, free that mirror: its metadata struct is
   copied back to the host so the device arrays it points to can be freed. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz = A->nonzerostate; /* nonzero state before assembly */

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    /* fetch the device struct to host to learn which device arrays it owns */
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
621 
/* MatDestroy_MPIAIJCUSPARSE - destroy an MPIAIJCUSPARSE matrix.

   Frees, in order: the cached device split-CSR structure (if any), the
   per-block cuSPARSE handles, the cuSPARSE library handle, the COO assembly
   arrays, and the Mat_MPIAIJCUSPARSE struct itself; then removes the composed
   methods and calls the base MATMPIAIJ destroy. The order matters: device
   resources must be released before the spptr struct is deleted, and the base
   destroy must run last.
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  PetscCheckFalse(!cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    /* the split-CSR struct lives in device memory; fetch a host copy to reach
       the device pointers it owns before freeing them */
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) { /* index arrays were deep-copied (64-bit-index builds) */
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) { /* off-diagonal block present only for parallel matrices */
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  /* thrust containers inside the seq blocks may throw; translate to a PETSc error */
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    /* Free COO */
    ierr = MatResetPreallocationCOO_MPIAIJCUSPARSE(A);CHKERRQ(ierr);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* unregister the type-specific methods composed in MatConvert_MPIAIJ_MPIAIJCUSPARSE() */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
674 
/* MatConvert_MPIAIJ_MPIAIJCUSPARSE - convert a MATMPIAIJ matrix to MATMPIAIJCUSPARSE.

   Duplicates or copies B according to reuse (MAT_INPLACE_MATRIX leaves *newmat
   == B untouched here), switches the diagonal/off-diagonal sequential blocks
   and the scatter vector to their CUDA types, creates the cuSPARSE handle on
   first conversion, installs the CUSPARSE-specific ops table, and composes the
   type-specific methods queried by name elsewhere in PETSc.
*/
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  /* vectors created from this matrix default to CUDA */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  /* move the sequential sub-blocks and the scatter vector to the GPU types */
  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  /* allocate the CUSPARSE bookkeeping struct once; MAT_REUSE_MATRIX keeps the existing one */
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE;
    stat     = cusparseCreate(&(cusp->handle));CHKERRCUSPARSE(stat);
    a->spptr = cusp;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
727 
/* MatCreate_MPIAIJCUSPARSE - registered constructor for MATMPIAIJCUSPARSE.

   Creates the matrix as a plain MPIAIJ matrix and then converts it in place;
   the order (device init, base create, in-place convert) is required.
*/
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
738 
739 /*@
740    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
742    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
743    assembly performance the user should preallocate the matrix storage by setting
744    the parameter nz (or the array nnz).  By setting these parameters accurately,
745    performance during matrix assembly can be increased by more than a factor of 50.
746 
747    Collective
748 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it determined from M)
.  n - number of local columns (or PETSC_DECIDE to have it determined from N)
.  M - number of global rows (or PETSC_DETERMINE to have it determined from m)
.  N - number of global columns (or PETSC_DETERMINE to have it determined from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
         (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
         (possibly different for each row) or NULL
756 
757    Output Parameter:
758 .  A - the matrix
759 
760    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
761    MatXXXXSetPreallocation() paradigm instead of this routine directly.
762    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
763 
764    Notes:
   If d_nnz (resp. o_nnz) is given then d_nz (resp. o_nz) is ignored
766 
767    The AIJ format (also called the Yale sparse matrix format or
768    compressed row storage), is fully compatible with standard Fortran 77
769    storage.  That is, the stored row and column indices can begin at
770    either one (as in Fortran) or zero.  See the users' manual for details.
771 
   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
774    allocation.  For large problems you MUST preallocate memory or you
775    will get TERRIBLE performance, see the users' manual chapter on matrices.
776 
777    By default, this format uses inodes (identical nodes) when possible, to
778    improve numerical efficiency of matrix-vector products and solves. We
779    search for consecutive rows with the same nonzero structure, thereby
780    reusing matrix information to achieve increased efficiency.
781 
782    Level: intermediate
783 
784 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
785 @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    commsize;

  PetscFunctionBegin;
  /* create the matrix shell, then pick the sequential or parallel CUSPARSE type
     based on the communicator size and preallocate accordingly */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&commsize);CHKERRMPI(ierr);
  if (commsize == 1) { /* uniprocessor: sequential CUSPARSE matrix */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {             /* parallel: MPI CUSPARSE matrix with diagonal/off-diagonal blocks */
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
804 
805 /*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.
807 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
809    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
810    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
811 
812    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
813    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
814    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
815    for communicators controlling multiple processes.  It is recommended that you call both of
816    the above preallocation routines for simplicity.
817 
818    Options Database Keys:
819 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
820 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
821 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
822 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
823 
824   Level: beginner
825 
826  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
827 M*/
828 
829 /*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
831 
832   Level: beginner
833 
834  .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
835 M*/
836 
/* MatCUSPARSEGetDeviceMatWrite - return (creating on first use) the device-resident
   stripped-down split-CSR view of an AIJCUSPARSE matrix, used for device-side
   matrix assembly. Works for both sequential and MPI matrices.

   Input Parameter:
.  A - assembled MATSEQAIJCUSPARSE or MATMPIAIJCUSPARSE matrix (must use MAT_CUSPARSE_CSR storage)

   Output Parameter:
.  B - device pointer to the PetscSplitCSRDataStructure (set to NULL for factored matrices)
*/
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  PetscErrorCode             ierr;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL; /* 32-bit CSR arrays as stored by cuSPARSE */
  PetscScalar                *aa = NULL,*ba = NULL;
  Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    /* no device view for factored matrices */
    *B = NULL;
    PetscFunctionReturn(0);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  /* locate the (diagonal) Mat_SeqAIJ block "jaca" and, in parallel, the off-diagonal "jacb";
     make sure current values are on the GPU */
  if (size == 1) {
    PetscBool isseqaij;

    ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      PetscCheckFalse(!jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    } else {
      /* an MPI matrix on a one-process communicator: use its diagonal block */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      bi = NULL; /* no off-diagonal block in the sequential case */
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      PetscCheckFalse(!matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheckFalse(!matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        /* device view needs uncompressed row offsets; build and cache them once */
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    /* same uncompressed-row-offset caching for the diagonal block */
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    cudaError_t                cerr;
    PetscSplitCSRDataStructure h_mat;

    // create and populate struct on host and copy to device
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscInt   *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      /* colmap maps a global column to 1 + its local off-diagonal column (0 = not present) */
      ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (PetscInt)(ii+1); /* PetscInt cast: avoid truncation with 64-bit indices */
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make a long version of these
        /* cuSPARSE stores 32-bit indices; widen B's CSR arrays to PetscInt on the device */
        int        *h_bi32, *h_bj32;
        PetscInt   *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr);
        cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
        cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];

        cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

        h_mat->offdiag.i = d_bi64;
        h_mat->offdiag.j = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE; /* we own these deep copies; freed in destroy/assemblyend */

        ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr);
      }
#else
      /* 32-bit indices: alias cuSPARSE's arrays directly, nothing extra allocated */
      h_mat->offdiag.i = (PetscInt*)bi;
      h_mat->offdiag.j = (PetscInt*)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      ierr = PetscFree(colmap);CHKERRQ(ierr);
    }
    /* ownership ranges and global column count for global->local index translation */
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    {
      /* "%d" takes an int: cast the size_t from sizeof() explicitly (was a format mismatch) */
      PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size of PetscInt = %d",(int)sizeof(PetscInt));
      int        *h_ai32, *h_aj32;
      PetscInt   *h_ai64, *h_aj64, *d_ai64, *d_aj64;
      ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr);
      cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
      cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];

      cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

      h_mat->diag.i = d_ai64;
      h_mat->diag.j = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr);
    }
#else
    h_mat->diag.i = (PetscInt*)ai;
    h_mat->diag.j = (PetscInt*)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
1025