xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 3307d110e72ee4e6d2468971620073eb5ff93529) !
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
3 
4 #include <petscconf.h>
5 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
6 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
7 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
8 #include <thrust/advance.h>
9 #include <thrust/partition.h>
10 #include <thrust/sort.h>
11 #include <thrust/unique.h>
12 #include <petscsf.h>
13 
/* Thrust functor: given a tuple (src, dst), assign the first element to the
   second.  Used with thrust::for_each over a zip iterator to gather permuted
   values into a contiguous workspace (see MatSetValuesCOO_MPIAIJCUSPARSE_Basic). */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
23 
/* Free all device-side COO assembly data attached to an MPIAIJCUSPARSE matrix:
   the extended-COO maps/permutations/communication buffers (when that path was
   used) and the basic-COO permutation (coo_p) and workspace (coo_pw) arrays.
   Safe to call when no COO preallocation has been done (spptr may be NULL). */
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(0); /* nothing was ever allocated */
  if (cusparseStruct->use_extended_coo) {
    /* device mirrors of the host COO maps built in MatSetPreallocationCOO_MPIAIJCUSPARSE */
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  /* basic-COO permutation and permuted-value workspace (thrust vectors) */
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(0);
}
55 
/* Set the values of an MPIAIJCUSPARSE matrix preallocated via the "basic" COO
   path.  v[] holds one scalar per COO entry in the caller's original order and
   may live in host or device memory.  If a permutation (coo_p) was recorded at
   preallocation time, gather v through it into the workspace coo_pw so that
   diagonal-block values come first; then forward the diagonal range to a->A
   and the off-diagonal range to a->B via the SeqAIJ COO setter. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total COO entries: diagonal + off-diagonal */

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v); /* already on device: use in place */
    } else {
      /* host values: stage them in a temporary device array */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
      d_v = w->data();
    }

    /* zip (v permuted by coo_p, coo_pw) so for_each performs coo_pw[i] = v[coo_p[i]] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w; /* staging copy no longer needed; values now live in coo_pw */
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode));
  } else {
    /* no permutation was needed: v (if given) is already ordered diagonal-first */
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode));
  }
  PetscFunctionReturn(0);
}
92 
/* Predicate functor for zipped (i,j,perm) tuples: true when the column entry
   (tuple slot 1) lies inside the diagonal-block column range [_cstart,_cend).
   Used with thrust::partition to move diagonal-block entries to the front. */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return (col >= _cstart) && (col < _cend);
  }
};
105 
/* Predicate functor: true when a (global) column index falls outside the
   diagonal-block column range [_cstart,_cend), i.e. the entry belongs to the
   off-diagonal block. */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return !(_cstart <= c && c < _cend);
  }
};
117 
/* Unary functor: convert a global index to a local one by subtracting the
   owning rank's range start.  Used with thrust::transform on row/column arrays. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};
129 
/* "Basic" COO preallocation: every COO index is owned by this rank.  Partition
   the entries on the device so diagonal-block entries (column in
   [cmap->rstart,cmap->rend)) come first, build the compacted off-diagonal
   column map (b->garray), and preallocate the sequential diagonal (b->A) and
   off-diagonal (b->B) SEQAIJCUSPARSE blocks directly from device data.  When a
   reordering was necessary, the permutation is kept in cusp->coo_p for use by
   MatSetValuesCOO_MPIAIJCUSPARSE_Basic. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscInt               N,*jj;
  size_t                 noff = 0; /* number of distinct off-diagonal (global) columns */
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  PetscCall(PetscLogGpuTimeBegin());
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* entries are not already ordered diagonal-first: partition the (i,j,perm)
       triples and remember the permutation so values can be reordered later */
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no,&jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j,wit);
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff,&b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt)));
  /* map the off-diagonal (global) column ids in jj[] to compacted local ids via a hash-based global-to-local mapping */
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj));
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  /* recreate the sequential diagonal and off-diagonal blocks as CUSPARSE matrices */
  PetscCall(MatCreate(PETSC_COMM_SELF,&b->A));
  PetscCall(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
  PetscCall(MatSetType(b->A,MATSEQAIJCUSPARSE));
  PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
  PetscCall(MatCreate(PETSC_COMM_SELF,&b->B));
  PetscCall(MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff));
  PetscCall(MatSetType(b->B,MATSEQAIJCUSPARSE));
  PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A,B->boundtocpu));
  PetscCall(MatBindToCPU(b->B,B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(0);
}
209 
/* COO preallocation entry point for MPIAIJCUSPARSE.  If on every rank all COO
   indices are nonnegative and row-owned locally, use the lighter "basic"
   device-side path; otherwise fall back to the general MPIAIJ COO machinery
   and mirror the host-side maps/permutations it builds onto the GPU (the
   "extended" path used later by MatSetValuesCOO_MPIAIJCUSPARSE). */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ         *mpiaij    = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart,rend;

  PetscFunctionBegin;
  /* discard any previous assembly/scatter state before re-preallocating */
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled                                                                      = PETSC_FALSE;
  mat->was_assembled                                                                  = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap,&rstart,&rend));
    PetscCall(PetscGetMemType(coo_i,&mtype));
    /* NOTE(review): the scan below only runs for host-resident indices;
       device-resident indices keep coo_basic = PETSC_TRUE */
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    /* allocate device mirrors of the host COO maps built by MatSetPreallocationCOO_MPIAIJ:
       *_1 arrays describe locally owned entries, *_2 arrays describe entries received from
       remote ranks; Cperm1/sendbuf/recvbuf support the value exchange */
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar)));

    /* copy the host maps to the device */
    PetscCallCUDA(cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(0);
}
293 
/* Kernel: gather the COO values destined for remote ranks into a contiguous
   send buffer, buf[i] = kv[perm[i]] for i in [0,nnz).  Uses a grid-stride
   loop so any launch configuration covers all entries. */
__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  const PetscCount stride = gridDim.x * blockDim.x;

  for (PetscCount tid = blockIdx.x*blockDim.x + threadIdx.x; tid < nnz; tid += stride) buf[tid] = kv[perm[tid]];
}
300 
/* Kernel: accumulate COO values into the CSR value array a[].  For each of the
   nnz destination nonzeros i, the source entries are kv[perm[k]] for
   k in [jmap[i], jmap[i+1]), added into a[imap[i]].  Grid-stride loop; each
   destination nonzero is handled by exactly one thread, so no atomics needed. */
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
{
  const PetscCount stride = gridDim.x * blockDim.x;

  for (PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; i < nnz; i += stride) {
    for (PetscCount k = jmap[i]; k < jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];
  }
}
307 
/* Set values of a COO-preallocated MPIAIJCUSPARSE matrix.  On the extended
   path: stage host values on the device if needed, pack and send entries
   owned by other ranks via the coo_sf star forest, and overlap that
   communication with the accumulation of locally owned entries into the
   diagonal (A) and off-diagonal (B) blocks; received remote entries are added
   afterwards.  INSERT_VALUES zeroes the blocks first so the kernels' "+="
   produces an overwrite.  Falls back to the basic path otherwise. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                 A      = mpiaij->A,B = mpiaij->B;
  PetscCount          Annz1  = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa,*Ba = NULL;
  PetscScalar        *vsend  = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1     = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size));
    PetscCall(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      PetscCallCUDA(cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); /* write matrix values */
      PetscCallCUDA(cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar)));
      if (size > 1) {
        PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba));
        PetscCallCUDA(cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar)));
      }
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa)); /* read & write matrix values */
      if (size > 1) PetscCall(MatSeqAIJCUSPARSEGetArray(B,&Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    if (size > 1) PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz1) {
      MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    if (Bnnz1) {
      MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    if (size > 1) PetscCall(PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2) {
      MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    if (Bnnz2) {
      MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
      if (size > 1) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));
      if (size > 1) PetscCall(MatSeqAIJCUSPARSERestoreArray(B,&Ba));
    }
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1)); /* free the staging copy */
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU; /* up-to-date values now live on the device */
  PetscFunctionReturn(0);
}
389 
/* Merge the diagonal (Ad) and off-diagonal (Ao) blocks of an MPIAIJCUSPARSE
   matrix into one sequential matrix A_loc.  When glob is non-NULL, also
   return an IS mapping the merged local columns to global column indices:
   the first dn entries are this rank's own column range, the remaining on
   entries come from the off-diagonal column map cmap. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat             Ad,Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc));
  if (glob) {
    PetscInt cst,dn,on,*gidx;

    PetscCall(MatGetLocalSize(Ad,NULL,&dn));
    PetscCall(MatGetLocalSize(Ao,NULL,&on));
    PetscCall(MatGetOwnershipRangeColumn(A,&cst,NULL));
    PetscCall(PetscMalloc1(dn+on,&gidx));
    for (PetscInt k = 0; k < dn; k++) gidx[k]    = cst + k; /* owned columns */
    for (PetscInt k = 0; k < on; k++) gidx[dn+k] = cmap[k]; /* ghost columns */
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob));
  }
  PetscFunctionReturn(0);
}
411 
/* Classical (non-COO) preallocation for MPIAIJCUSPARSE: validate the nnz
   hints in debug builds, discard stale column-map/scatter state, (re)create
   the sequential diagonal (b->A) and off-diagonal (b->B) blocks as
   SEQAIJCUSPARSE, and forward d_nz/d_nnz and o_nz/o_nnz to
   MatSeqAIJSetPreallocation on each block. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscInt           i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  /* debug-only sanity checks on the user-provided per-row nnz counts */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheck(d_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheck(o_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
  /* drop state that depends on the previous nonzero pattern */
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF,&b->A));
    PetscCall(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
    PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size));
    PetscCall(MatCreate(PETSC_COMM_SELF,&b->B));
    /* single-rank runs have no off-diagonal block: give it zero columns */
    PetscCall(MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0));
    PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));
  }
  PetscCall(MatSetType(b->A,MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B,MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A,B->boundtocpu));
  PetscCall(MatBindToCPU(b->B,B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
464 
/* y = A*x: start the halo scatter of x into the local work vector a->lvec,
   overlap it with the diagonal-block multiply, then complete the scatter and
   add the off-diagonal-block contribution (y += B*lvec). */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A,xx,yy));       /* local part, overlapped with the scatter */
  PetscCall(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B,a->lvec,yy,yy)); /* add ghost contributions */
  PetscFunctionReturn(0);
}
476 
/* Zero every stored entry of an MPIAIJCUSPARSE matrix by zeroing its
   sequential diagonal (A) and off-diagonal (B) blocks. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(aij->A));
  PetscCall(MatZeroEntries(aij->B));
  PetscFunctionReturn(0);
}
486 
/* z = y + A*x: start the halo scatter of x, overlap it with the diagonal-block
   multiply-add (z = y + A_d*x), then finish the scatter and accumulate the
   off-diagonal-block contribution (z += B*lvec). */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A,xx,yy,zz));
  PetscCall(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B,a->lvec,zz,zz));
  PetscFunctionReturn(0);
}
498 
/* y = A^T*x: compute the off-diagonal-block transpose product into the work
   vector first, do the diagonal-block transpose product into y, then reverse-
   scatter the work vector, adding its entries into the owning ranks' y. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B,xx,a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A,xx,yy));
  PetscCall(VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
  PetscFunctionReturn(0);
}
510 
/* Record the requested cuSPARSE storage format for SpMV on the diagonal
   block, the off-diagonal block, or both.  The formats are stored in the
   Mat_MPIAIJCUSPARSE struct and applied to the sequential blocks when they
   are (re)created.  Raises PETSC_ERR_SUP for any other operation.

   Fix: the error message previously listed MAT_CUSPARSE_MULT_DIAG twice and
   a nonexistent MAT_CUSPARSE_MULT_ALL; it now names the operations actually
   handled by the switch (MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG,
   MAT_CUSPARSE_ALL). */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
533 
/* Process the -mat_cusparse_* options that select the cuSPARSE storage
   format used for SpMV: one option each for the diagonal block, the
   off-diagonal block, and both at once.  Skipped for factored matrices. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject,"MPIAIJCUSPARSE options");
  if (A->factortype==MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format));
    /* the combined option, when given, overrides both of the above */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
557 
/* Finish assembly via the generic MPIAIJ path, make the local work vector a
   CUDA vector, and — if the nonzero pattern changed — tear down the cached
   split-CSR device matrix (cusp->deviceMat): its index arrays are stale.
   The struct is copied back to the host first since its members (device
   pointers) cannot be read directly from host code. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz = A->nonzerostate; /* snapshot before assembly */

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A,mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec,VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    PetscCall(PetscNew(&h_mat));
    /* bring the struct to the host to obtain the device pointers it holds */
    PetscCallCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
588 
/* Destroy an MPIAIJCUSPARSE matrix: free the cached split-CSR device matrix
   (if any), release all COO device data, delete the Mat_MPIAIJCUSPARSE
   struct, unregister the composed methods, and finish with the generic
   MPIAIJ destructor. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    PetscCall(PetscInfo(A,"Have device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    /* copy the struct to the host to read the device pointers it contains */
    PetscCallCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
  }
  /* Free COO */
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(0);
}
626 
/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE: duplicate/copy per the reuse
   mode, switch the default vector type and the sequential blocks to their
   CUDA variants, attach a fresh Mat_MPIAIJCUSPARSE struct when needed, and
   install the CUSPARSE-specific operations and composed methods. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B,MAT_COPY_VALUES,newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B,*newmat,SAME_NONZERO_PATTERN));
  A = *newmat; /* MAT_INPLACE_MATRIX: operate on B itself */
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype));

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) PetscCall(MatSetType(a->A,MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B,MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec,VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(0);
}
670 
/* Registered constructor for MATMPIAIJCUSPARSE: build a plain MPIAIJ matrix,
   then convert it in place to the CUSPARSE-backed variant (which installs the
   GPU-specific operations and composed functions). */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* make sure the CUDA device is initialized before any GPU work */
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A));
  PetscFunctionReturn(0);
}
679 
680 /*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these
   parameters accurately, performance during matrix assembly can be increased by more than a factor of 50.
687 
688    Collective
689 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal portion of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal portion of the local submatrix (possibly different for each row) or NULL
697 
698    Output Parameter:
699 .  A - the matrix
700 
701    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
702    MatXXXXSetPreallocation() paradigm instead of this routine directly.
703    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
704 
705    Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
707 
708    The AIJ format (also called the Yale sparse matrix format or
709    compressed row storage), is fully compatible with standard Fortran 77
710    storage.  That is, the stored row and column indices can begin at
711    either one (as in Fortran) or zero.  See the users' manual for details.
712 
   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT, o_nz=PETSC_DEFAULT and d_nnz=NULL, o_nnz=NULL for PETSc to control dynamic memory
715    allocation.  For large problems you MUST preallocate memory or you
716    will get TERRIBLE performance, see the users' manual chapter on matrices.
717 
718    By default, this format uses inodes (identical nodes) when possible, to
719    improve numerical efficiency of matrix-vector products and solves. We
720    search for consecutive rows with the same nonzero structure, thereby
721    reusing matrix information to achieve increased efficiency.
722 
723    Level: intermediate
724 
725 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
726 @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt nproc;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,M,N));
  PetscCallMPI(MPI_Comm_size(comm,&nproc));
  /* pick the sequential or parallel CUSPARSE type based on the communicator size */
  if (nproc == 1) {
    PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A,d_nz,d_nnz));
  } else {
    PetscCall(MatSetType(*A,MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz));
  }
  PetscFunctionReturn(0);
}
744 
745 /*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
750    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
751 
752    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
753    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
754    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
755    for communicators controlling multiple processes.  It is recommended that you call both of
756    the above preallocation routines for simplicity.
757 
758    Options Database Keys:
759 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
760 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
761 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
762 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
763 
764   Level: beginner
765 
766  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
767 M*/
768 
769 /*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
771 
772   Level: beginner
773 
774  .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
775 M*/
776 
// Get a device (GPU) pointer to a stripped-down CSR view of A for device-side
// assembly. Works for both sequential and MPI matrices. On first use the
// PetscSplitCSRDataStructure is built on the host from the cuSPARSE CSR arrays
// and copied to the device; later calls reuse the cached device copy.
// *B receives a DEVICE pointer (NULL for factored matrices). On success A is
// marked as offloaded to the GPU.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;                                       // cached device-side struct, NULL if not built yet
  PetscMPIInt                size;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL; // device CSR row offsets / column indices (32-bit, as stored by cuSPARSE)
  PetscScalar                *aa = NULL,*ba = NULL;                       // device CSR values: diagonal (a*) and off-diagonal (b*) blocks
  Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;                  // host-side AIJ data for the diagonal / off-diagonal blocks
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  // factored matrices are not supported: hand back NULL rather than erroring
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  // locate jaca (host AIJ data of the [diagonal] block) and push the CSR data to the GPU
  if (size == 1) {
    PetscBool isseqaij;

    // a one-process matrix may still be of type MPIAIJ, so check the base type
    PetscCall(PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij));
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      PetscCheck(jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    } else {
      // MPIAIJ on one rank: operate on its (sequential) diagonal block aij->A
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      // no off-diagonal block on a single process
      bi = NULL;
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    // parallel case: need both the diagonal (aij->A) and off-diagonal (aij->B) blocks
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    PetscCheck(A->nooffprocentries || aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    PetscCheck(cusparsestructA->format==MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      PetscCheck(matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheck(matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        // compressed-row storage on device omits empty rows; build and cache a full
        // (n+1)-entry row-offset array on the GPU from the host row pointers
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  // same treatment for the diagonal block's row offsets
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    PetscSplitCSRDataStructure h_mat;

    // create and populate struct on host, then copy it to the device
    PetscCall(PetscInfo(A,"Create device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMalloc((void**)&d_mat,sizeof(*d_mat)));
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscInt   *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      PetscCheck(!n || aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      // colmap maps a global column index to local off-diagonal column + 1 (0 = not present)
      PetscCall(PetscCalloc1(N+1,&colmap));
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make a long version of these
        // cuSPARSE stores indices as 32-bit ints; widen them to PetscInt (64-bit)
        // via a device->host copy, host-side conversion, and host->device copy
        int        *h_bi32, *h_bj32;
        PetscInt   *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        PetscCall(PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64));
        PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost));
        for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
        PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost));
        for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];

        PetscCallCUDA(cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64)));
        PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64)));
        PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice));

        h_mat->offdiag.i = d_bi64;
        h_mat->offdiag.j = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE; // widened copies were allocated here and must be freed on destroy

        PetscCall(PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64));
      }
#else
      // 32-bit PetscInt matches cuSPARSE's int indices: alias the existing device arrays
      h_mat->offdiag.i = (PetscInt*)bi;
      h_mat->offdiag.j = (PetscInt*)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      PetscCallCUDA(cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap)));
      PetscCallCUDA(cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice));
      PetscCall(PetscFree(colmap));
    }
    // ownership-range metadata for the device-side struct
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    {
      // same 32-bit -> 64-bit index widening for the diagonal block
      int        *h_ai32, *h_aj32;
      PetscInt   *h_ai64, *h_aj64, *d_ai64, *d_aj64;

      static_assert(sizeof(PetscInt) == 8,"");
      PetscCall(PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64));
      PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost));
      for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
      PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost));
      for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];

      PetscCallCUDA(cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64)));
      PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64)));
      PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice));

      h_mat->diag.i = d_ai64;
      h_mat->diag.j = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      PetscCall(PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64));
    }
#else
    h_mat->diag.i = (PetscInt*)ai;
    h_mat->diag.j = (PetscInt*)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    PetscCallCUDA(cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice));
    PetscCall(PetscFree(h_mat));
  } else {
    PetscCall(PetscInfo(A,"Reusing device matrix\n"));
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
964