xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 89669be4d29968dc8d4c19ce1b69194a6a561ea4)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
3 
4 #include <petscconf.h>
5 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
6 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
7 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
8 #include <thrust/advance.h>
9 #include <thrust/partition.h>
10 #include <thrust/sort.h>
11 #include <thrust/unique.h>
12 #include <petscsf.h>
13 
/* Thrust functor for use with for_each over a zip iterator: assigns the
   tuple's first element (source) into its second element (destination). */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    using thrust::get;
    get<1>(t) = get<0>(t);
  }
};
23 
/* Release all device-side COO assembly state attached to the matrix.

   When the extended-COO path was used, frees the device maps/permutations and
   send/receive buffers; in all cases drops the basic-path permutation vector
   and permuted-values workspace.  Safe to call when no COO state exists. */
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij  = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)aij->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  if (cusp->use_extended_coo) {
    /* device buffers allocated by the extended path of MatSetPreallocationCOO_MPIAIJCUSPARSE */
    void *devbufs[] = {cusp->Ajmap1_d,cusp->Aperm1_d,
                       cusp->Bjmap1_d,cusp->Bperm1_d,
                       cusp->Aimap2_d,cusp->Ajmap2_d,cusp->Aperm2_d,
                       cusp->Bimap2_d,cusp->Bjmap2_d,cusp->Bperm2_d,
                       cusp->Cperm1_d,cusp->sendbuf_d,cusp->recvbuf_d};
    for (size_t k=0; k<sizeof(devbufs)/sizeof(devbufs[0]); k++) PetscCallCUDA(cudaFree(devbufs[k]));
  }
  cusp->use_extended_coo = PETSC_FALSE;
  /* basic-path state: index permutation and permuted-values workspace */
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  PetscFunctionReturn(0);
}
53 
/* Basic-path MatSetValuesCOO: v[] is ordered exactly as the COO indices given
   at preallocation time.  If a device permutation (coo_p) was built there,
   v is gathered into the coo_pw workspace so diagonal-block values precede
   off-diagonal ones; otherwise v is split by offset and handed to the two
   sequential blocks directly.  v may be host or device memory, or NULL
   (passed through to the sequential blocks). */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total #COO entries on this rank */

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL; /* temporary device copy when v lives on the host */

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
      d_v = w->data();
    }

    /* coo_pw[k] = v[coo_p[k]]: gather user values into partitioned (diag-first) order */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    /* first coo_nd permuted values feed the diagonal block, the remainder the off-diagonal block */
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode));
  }
  PetscFunctionReturn(0);
}
90 
/* Predicate on (row,col,perm) tuples: true when the column index (slot 1)
   lies inside the local diagonal block [_cstart,_cend). */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return col >= _cstart && col < _cend;
  }
};
103 
/* Predicate: true when column index c falls outside the local diagonal
   block [_cstart,_cend). */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return !(_cstart <= c && c < _cend);
  }
};
115 
/* Unary transform: shift a global index down by the local range start,
   yielding the corresponding local index. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    const PetscInt local = c - _start;
    return local;
  }
};
127 
/* Basic-path COO preallocation: builds the sequential diagonal (A) and
   off-diagonal (B) CUSPARSE blocks of B directly from the COO indices.

   The entries are partitioned on the device so diagonal-block entries come
   first and off-diagonal ones second; when a reorder is needed, cusp->coo_p
   records the permutation so MatSetValuesCOO can gather user values the same
   way.  Off-diagonal global column ids are compacted through b->garray.
   Callers guarantee every coo_i[] is owned by this rank and indices are
   nonnegative (checked in MatSetPreallocationCOO_MPIAIJCUSPARSE). */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscInt               N,*jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  PetscCall(PetscLogGpuTimeBegin());
  /* Find the first off-diagonal entry and the first diagonal entry after it;
     only when both exist is the input not already (diag,offdiag)-partitioned */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);  /* permutation: user ordering -> partitioned ordering */
    cusp->coo_pw = new THRUSTARRAY(n);    /* workspace for permuted values in MatSetValuesCOO */
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    /* partition (i,j,p) triplets so diagonal-block entries precede off-diagonal ones */
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* #entries in the diagonal block */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* #entries in the off-diagonal block */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no,&jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j,wit); /* #distinct off-diagonal columns = local width of b->B */
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff,&b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt)));
  /* compact jj[] in place: each global off-diagonal column id -> its index in garray */
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj));
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  PetscCall(MatCreate(PETSC_COMM_SELF,&b->A));
  PetscCall(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
  PetscCall(MatSetType(b->A,MATSEQAIJCUSPARSE));
  PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
  PetscCall(MatCreate(PETSC_COMM_SELF,&b->B));
  PetscCall(MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff));
  PetscCall(MatSetType(b->B,MATSEQAIJCUSPARSE));
  PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A,B->boundtocpu));
  PetscCall(MatBindToCPU(b->B,B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(0);
}
207 
/* COO preallocation entry point for MPIAIJCUSPARSE.

   Chooses between two paths: the "basic" path, usable when every rank's
   indices are nonnegative and locally owned, and the "extended" path
   (MatSetPreallocationCOO_MPIAIJ) which supports negative (ignored) indices
   and remote entries.  On the extended path, the host-side maps built by
   MatSetPreallocationCOO_MPIAIJ are mirrored into device memory for use by
   the CUDA kernels in MatSetValuesCOO_MPIAIJCUSPARSE. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ         *mpiaij    = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart,rend;

  PetscFunctionBegin;
  /* Discard any existing column map, ghost vector and scatter; they are rebuilt later */
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled                                                                      = PETSC_FALSE;
  mat->was_assembled                                                                  = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap,&rstart,&rend));
    PetscCall(PetscGetMemType(coo_i,&mtype));
    /* device-resident indices are not scanned; mtype stays PETSC_MEMTYPE_DEVICE and coo_basic stays true */
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    /* Allocate device mirrors of the extended-COO maps built on the host */
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar)));

    /* Copy the host-side maps to the device */
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(0);
}
287 
/* Kernel: gather kv[] through perm[] into the contiguous send buffer buf[],
   i.e. buf[i] = kv[perm[i]] for i in [0,nnz).  Uses a grid-stride loop so any
   launch configuration covers all entries. */
__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  const PetscCount stride = gridDim.x * blockDim.x;
  for (PetscCount idx = blockIdx.x*blockDim.x + threadIdx.x; idx < nnz; idx += stride) buf[idx] = kv[perm[idx]];
}
294 
/* Kernel: fold locally-provided COO values into the diagonal (A) and
   off-diagonal (B) blocks.

   Logical index i < Annz targets A's i-th nonzero; Annz <= i < Annz+Bnnz
   targets B's nonzero i-Annz.  Each target sums kv[] over its repeated COO
   entries via jmap/perm, then overwrites (INSERT_VALUES) or accumulates into
   the value array.

   Fix: the previous version decremented the loop variable i by Annz in the
   B branch, which corrupts the grid-stride traversal (skipped and duplicated
   entries) whenever the grid does not cover all Annz+Bnnz entries in one
   pass.  A separate local index j keeps the stride intact for any launch
   configuration. */
__global__ void MatAddLocalCOOValues(const PetscScalar kv[],InsertMode imode,
  PetscCount Annz,const PetscCount Ajmap1[],const PetscCount Aperm1[],PetscScalar Aa[],
  PetscCount Bnnz,const PetscCount Bjmap1[],const PetscCount Bperm1[],PetscScalar Ba[])
{
  PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size  = gridDim.x * blockDim.x;
  for (; i<Annz+Bnnz; i+= grid_size) {
    PetscScalar sum = 0.0;
    if (i<Annz) {
      for (PetscCount k=Ajmap1[i]; k<Ajmap1[i+1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES? 0.0 : Aa[i]) + sum;
    } else {
      const PetscCount j = i - Annz; /* index into B's nonzeros; do not modify i (grid stride) */
      for (PetscCount k=Bjmap1[j]; k<Bjmap1[j+1]; k++) sum += kv[Bperm1[k]];
      Ba[j] = (imode == INSERT_VALUES? 0.0 : Ba[j]) + sum;
    }
  }
}
313 
/* Kernel: accumulate COO values received from remote ranks into the diagonal
   (A) and off-diagonal (B) blocks.

   Logical index i < Annz2 targets A; Annz2 <= i < Annz2+Bnnz2 targets B.
   Each target j sums kv[] over Ajmap2[j]..Ajmap2[j+1] (resp. Bjmap2) through
   the perm arrays and adds into position Aimap2[j] (resp. Bimap2[j]).

   Fix: the previous version decremented the loop variable i by Annz2 in the
   B branch, which corrupts the grid-stride traversal whenever the grid does
   not cover all Annz2+Bnnz2 entries in one pass; a separate local index j
   keeps the stride intact for any launch configuration. */
__global__ void MatAddRemoteCOOValues(const PetscScalar kv[],
  PetscCount Annz2,const PetscCount Aimap2[],const PetscCount Ajmap2[],const PetscCount Aperm2[],PetscScalar Aa[],
  PetscCount Bnnz2,const PetscCount Bimap2[],const PetscCount Bjmap2[],const PetscCount Bperm2[],PetscScalar Ba[])
{
  PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<Annz2+Bnnz2; i+= grid_size) {
    if (i<Annz2) {
      for (PetscCount k=Ajmap2[i]; k<Ajmap2[i+1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      const PetscCount j = i - Annz2; /* index into B's remote entries; keep i intact for the stride */
      for (PetscCount k=Bjmap2[j]; k<Bjmap2[j+1]; k++) Ba[Bimap2[j]] += kv[Bperm2[k]];
    }
  }
}
329 
/* MatSetValuesCOO for MPIAIJCUSPARSE.

   Extended path: values destined for remote rows are packed on the GPU and
   reduced to their owners via PetscSF, overlapping the communication with
   the local-value kernel; received remote contributions are then added on top
   of the locally-set values.  Basic path: delegates to
   MatSetValuesCOO_MPIAIJCUSPARSE_Basic.  In both cases the matrix data ends
   up on the GPU (offloadmask = PETSC_OFFLOAD_GPU). */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                 A      = mpiaij->A,B = mpiaij->B;
  PetscCount          Annz   = mpiaij->Annz,Annz2 = mpiaij->Annz2,Bnnz = mpiaij->Bnnz,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa,*Ba = NULL;
  PetscScalar        *vsend  = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1     = v; /* device view of v; replaced by a copy if v is host memory */
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size));
    PetscCall(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      PetscCallCUDA(cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); /* write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa)); /* read & write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArray(B,&Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz+Bnnz > 0) {
      MatAddLocalCOOValues<<<(Annz+Bnnz+255)/256,256>>>(v1,imode,Annz,Ajmap1,Aperm1,Aa,Bnnz,Bjmap1,Bperm1,Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    PetscCall(PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2+Bnnz2 > 0) {
      MatAddRemoteCOOValues<<<(Annz2+Bnnz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArray(B,&Ba));
    }
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1)); /* release the temporary device copy */
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
398 
/* Merge the local diagonal and off-diagonal blocks into one sequential
   matrix A_loc; optionally return in glob the global column index of each
   merged local column (owned range first, then the off-diagonal map). */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat             Ad,Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc));
  if (glob) {
    PetscInt cstart,ndiag,noff,*gidx,k;

    PetscCall(MatGetLocalSize(Ad,NULL,&ndiag));
    PetscCall(MatGetLocalSize(Ao,NULL,&noff));
    PetscCall(MatGetOwnershipRangeColumn(A,&cstart,NULL));
    PetscCall(PetscMalloc1(ndiag+noff,&gidx));
    /* diagonal-block columns are the contiguous owned range ... */
    for (k = 0; k < ndiag; k++) gidx[k] = cstart + k;
    /* ... off-diagonal columns come from the compact global column map */
    for (k = 0; k < noff; k++) gidx[ndiag+k] = cmap[k];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad),ndiag+noff,gidx,PETSC_OWN_POINTER,glob));
  }
  PetscFunctionReturn(0);
}
420 
/* MatMPIAIJSetPreallocation implementation for MPIAIJCUSPARSE: (re)creates the
   sequential diagonal (A) and off-diagonal (B) CUSPARSE blocks and
   preallocates them.  d_nz/d_nnz describe the diagonal block, o_nz/o_nnz the
   off-diagonal block; negative nnz entries are rejected in debug builds. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (PetscInt i=0; i<B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (PetscInt i=0; i<B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
  }
  /* drop any stale column map, ghost vector and scatter; assembly rebuilds them */
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF,&b->A));
    PetscCall(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
    PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
  }
  if (!b->B) {
    PetscMPIInt size;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size));
    PetscCall(MatCreate(PETSC_COMM_SELF,&b->B));
    /* a uniprocessor run has an empty off-diagonal block */
    PetscCall(MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0));
    PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));
  }
  PetscCall(MatSetType(b->A,MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B,MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A,B->boundtocpu));
  PetscCall(MatBindToCPU(b->B,B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
473 
/* yy = A*xx: overlap the halo scatter of xx with the diagonal-block product,
   then fold in the off-diagonal contribution from the ghost values. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  /* start communicating ghost values of xx while multiplying with the local block */
  PetscCall(VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*mpiaij->A->ops->mult)(mpiaij->A,xx,yy));
  PetscCall(VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy));
  PetscFunctionReturn(0);
}
485 
/* Zeroes the values of both the diagonal and off-diagonal sequential blocks. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(mpiaij->A));
  PetscCall(MatZeroEntries(mpiaij->B));
  PetscFunctionReturn(0);
}
495 
/* zz = yy + A*xx, overlapping the ghost-value scatter with the
   diagonal-block multiply-add. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz));
  PetscCall(VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz));
  PetscFunctionReturn(0);
}
507 
/* yy = A^T*xx: apply B^T into the ghost work vector and A^T locally, then
   reverse-scatter the ghost contributions back into yy with addition. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall((*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec));
  PetscCall((*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy));
  PetscCall(VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
  PetscCall(VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
  PetscFunctionReturn(0);
}
519 
/* Sets the cusparse storage format used for SpMV on the diagonal and/or
   off-diagonal block of an MPIAIJCUSPARSE matrix.

   Supported operations: MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG,
   MAT_CUSPARSE_ALL; any other op raises PETSC_ERR_SUP. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* message fixed: it used to list MAT_CUSPARSE_MULT_DIAG twice and name a
       nonexistent MAT_CUSPARSE_MULT_ALL instead of MAT_CUSPARSE_ALL */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
542 
/* Processes the -mat_cusparse_* storage-format options for an unfactored
   MPIAIJCUSPARSE matrix and forwards them to MatCUSPARSESetFormat (diagonal
   block, off-diagonal block, or both).  Factored matrices are skipped. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;  /* whether the option was set by the user */
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject,"MPIAIJCUSPARSE options");
  if (A->factortype==MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
566 
/* Assembly hook for MPIAIJCUSPARSE: runs the generic MPIAIJ assembly, makes
   the ghost work vector a CUDA vector, and frees any cached device-side split
   CSR matrix when the nonzero pattern changed (it is rebuilt on demand). */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz = A->nonzerostate; /* nonzero state before assembly */

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A,mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec,VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    /* copy the device-resident struct to the host so its interior device pointers can be freed */
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) { /* only free index arrays the struct owns */
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
597 
/* Destroys an MPIAIJCUSPARSE matrix: frees the cached device-side split CSR
   matrix (if any), releases COO assembly state and the cusparse container,
   clears the composed methods, then delegates to the MPIAIJ destructor. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    PetscCall(PetscInfo(A,"Have device matrix\n"));
    /* copy the device-resident struct to the host so its interior device pointers can be freed */
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) { /* only free index arrays the struct owns */
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
  }
  /* Free COO */
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(0);
}
635 
/* Converts an MPIAIJ matrix to MPIAIJCUSPARSE (or refreshes an existing one).
   Switches the sub-blocks and ghost work vector to their CUDA types,
   installs the CUSPARSE-specific operations, and composes the
   preallocation/COO/format methods on the object. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B,MAT_COPY_VALUES,newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B,*newmat,SAME_NONZERO_PATTERN));
  A = *newmat; /* for MAT_INPLACE_MATRIX the caller passes B itself in *newmat */
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype));

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) PetscCall(MatSetType(a->A,MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B,MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec,VECSEQCUDA));

  /* allocate the cusparse container unless we are reusing a matrix that already has one */
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(0);
}
679 
/* MatCreate_MPIAIJCUSPARSE - type constructor for MATMPIAIJCUSPARSE: builds a plain
   MPIAIJ matrix, then converts it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  /* initialize the CUDA device before any GPU-side data is set up */
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A));
  PetscFunctionReturn(0);
}
688 
689 /*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz, o_nz (or the arrays d_nnz, o_nnz).  By setting these
   parameters accurately, performance during matrix assembly can be increased by more than a factor of 50.
696 
697    Collective
698 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the DIAGONAL portion of the local submatrix (same value is used for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the DIAGONAL portion of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of the local submatrix (same value is used for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the OFF-DIAGONAL portion of the local submatrix (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix
709 
710    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
711    MatXXXXSetPreallocation() paradigm instead of this routine directly.
712    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
713 
714    Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
716 
717    The AIJ format (also called the Yale sparse matrix format or
718    compressed row storage), is fully compatible with standard Fortran 77
719    storage.  That is, the stored row and column indices can begin at
720    either one (as in Fortran) or zero.  See the users' manual for details.
721 
   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT, o_nz=PETSC_DEFAULT and d_nnz=NULL, o_nnz=NULL for PETSc to
   control dynamic memory allocation.  For large problems you MUST preallocate memory
   or you will get TERRIBLE performance, see the users' manual chapter on matrices.
726 
727    By default, this format uses inodes (identical nodes) when possible, to
728    improve numerical efficiency of matrix-vector products and solves. We
729    search for consecutive rows with the same nonzero structure, thereby
730    reusing matrix information to achieve increased efficiency.
731 
732    Level: intermediate
733 
734 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJCUSPARSE`, `MATAIJCUSPARSE`
735 @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt nproc;

  PetscFunctionBegin;
  /* create the matrix object and fix its local/global dimensions */
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,M,N));
  /* pick the sequential or parallel CUSPARSE type based on the communicator size */
  PetscCallMPI(MPI_Comm_size(comm,&nproc));
  if (nproc == 1) {
    /* single-rank communicator: sequential CUSPARSE matrix */
    PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A,d_nz,d_nnz));
  } else {
    /* multi-rank communicator: MPI CUSPARSE matrix */
    PetscCall(MatSetType(*A,MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz));
  }
  PetscFunctionReturn(0);
}
753 
754 /*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.
760 
761    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
762    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
763    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
764    for communicators controlling multiple processes.  It is recommended that you call both of
765    the above preallocation routines for simplicity.
766 
767    Options Database Keys:
768 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
769 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
770 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
771 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
772 
773   Level: beginner
774 
775  .seealso: `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
776 M*/
777 
778 /*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
780 
781   Level: beginner
782 
783  .seealso: `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
784 M*/
785 
// Get GPU (device) pointers to a stripped-down CSR view of A for device-side
// (in-kernel) writes. Works for both sequential and MPI matrices. On first use the
// descriptor is built on the host and copied to the device; afterwards it is reused.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;       // device-resident descriptor (cached in spptr)
  PetscMPIInt                size;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL; // 32-bit device CSR arrays (diag A / offdiag B)
  PetscScalar                *aa = NULL,*ba = NULL;                       // device value arrays (diag / offdiag)
  Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    // factored matrices are not supported here; hand back NULL
    *B = NULL;
    PetscFunctionReturn(0);
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  // locate the (diagonal-block) Mat_SeqAIJ and its CUSPARSE struct; push values to the GPU
  if (size == 1) {
    PetscBool isseqaij;

    // a one-rank communicator may still hold either a SeqAIJ or an MPIAIJ matrix
    PetscCall(PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij));
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      PetscCheck(jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    } else {
      // MPIAIJ on one rank: use its diagonal block aij->A; the off-diagonal block is empty
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      // no off-diagonal block in the sequential case
      bi = NULL;
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    // parallel case: need both the diagonal (A) and off-diagonal (B) blocks
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    PetscCheck(A->nooffprocentries || aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    PetscCheck(cusparsestructA->format==MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      // make sure the latest host values of both blocks are on the GPU
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      PetscCheck(matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheck(matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        // compressed-row storage on B: lazily build a full-length (n+1) row-offset
        // array on the GPU from the host row pointers jacb->i, then reuse it
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    // same lazy full-length row-offset construction for the diagonal block
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    PetscSplitCSRDataStructure h_mat;

    // create and populate struct on host, then copy it to the device
    PetscCall(PetscInfo(A,"Create device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMalloc((void**)&d_mat,sizeof(*d_mat)));
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscInt   *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      PetscCheck(!n || aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      // colmap maps a global column index to (local B column + 1); 0 means "not present"
      PetscCall(PetscCalloc1(N+1,&colmap));
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // device CSR indices are 32-bit ints; widen them to 64-bit PetscInt copies on the device
        int        *h_bi32, *h_bj32;
        PetscInt   *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        PetscCall(PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64));
        PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost));
        for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
        PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost));
        for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];

        PetscCallCUDA(cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64)));
        PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64)));
        PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice));

        h_mat->offdiag.i = d_bi64;
        h_mat->offdiag.j = d_bj64;
        // widened copies are owned by the descriptor and must be freed on destroy
        h_mat->allocated_indices = PETSC_TRUE;

        PetscCall(PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64));
      }
#else
      // 32-bit PetscInt: the cuSPARSE arrays can be aliased directly
      h_mat->offdiag.i = (PetscInt*)bi;
      h_mat->offdiag.j = (PetscInt*)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      PetscCallCUDA(cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap)));
      PetscCallCUDA(cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice));
      PetscCall(PetscFree(colmap));
    }
    // ownership ranges and global column count (h_mat->M is A->cmap->N)
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    {
      // same 32-bit -> 64-bit widening for the diagonal block's CSR indices
      int        *h_ai32, *h_aj32;
      PetscInt   *h_ai64, *h_aj64, *d_ai64, *d_aj64;

      static_assert(sizeof(PetscInt) == 8,"");
      PetscCall(PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64));
      PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost));
      for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
      PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost));
      for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];

      PetscCallCUDA(cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64)));
      PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64)));
      PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice));

      h_mat->diag.i = d_ai64;
      h_mat->diag.j = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      PetscCall(PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64));
    }
#else
    h_mat->diag.i = (PetscInt*)ai;
    h_mat->diag.j = (PetscInt*)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    PetscCallCUDA(cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice));
    PetscCall(PetscFree(h_mat));
  } else {
    PetscCall(PetscInfo(A,"Reusing device matrix\n"));
  }
  *B = d_mat;
  // the caller will write on the device, so mark the GPU copy as current
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
973