xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision b122ec5aa1bd4469eb4e0673542fb7de3f411254)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
3 
4 #include <petscconf.h>
5 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
6 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
7 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
8 #include <thrust/advance.h>
9 #include <thrust/partition.h>
10 #include <thrust/sort.h>
11 #include <thrust/unique.h>
12 #include <petscsf.h>
13 
/* Thrust functor: given a (source, destination) tuple, copy the source value
   into the destination slot. Used to scatter permuted COO values on device. */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) { thrust::get<1>(t) = thrust::get<0>(t); }
};
23 
/* Free the device-side COO assembly data attached to an MPIAIJCUSPARSE matrix.

   When the extended COO path was used (use_extended_coo), releases the device
   copies of the map/permutation arrays and the send/recv buffers allocated in
   MatSetPreallocationCOO_MPIAIJCUSPARSE. Always drops the basic-path
   permutation (coo_p) and work array (coo_pw). Safe to call when the GPU
   struct was never attached (spptr NULL). */
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(0); /* nothing was ever allocated */
  if (cusparseStruct->use_extended_coo) {
    /* device mirrors of the host-side extended COO maps (see the cudaMalloc
       calls in MatSetPreallocationCOO_MPIAIJCUSPARSE) */
    CHKERRCUDA(cudaFree(cusparseStruct->Aimap1_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Aperm1_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Bimap1_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Bperm1_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Aimap2_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Aperm2_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Bimap2_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Bperm2_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Cperm1_d));
    CHKERRCUDA(cudaFree(cusparseStruct->sendbuf_d));
    CHKERRCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  /* basic-path members: deleting NULL is a no-op, so this is safe either way */
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(0);
}
55 
/* Insert/add COO values in the "basic" path (all entries locally owned).

   If preallocation found the (i,j) input interleaved diag/offdiag, it stored a
   permutation (cusp->coo_p) and a device work array (cusp->coo_pw); in that
   case v[] is permuted on the device into coo_pw, whose first coo_nd entries
   feed the diagonal block a->A and the rest the off-diagonal block a->B.
   Otherwise v is already in the expected order and is passed straight through
   (v may be NULL; the Seq routines treat that as "reuse current values"). */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total COO entries */

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v); /* v already lives on the device */
    } else {
      /* host values: stage them into a temporary device array first */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      CHKERRQ(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
      d_v = w->data();
    }

    /* zip (permuted source, destination) so one for_each performs the gather
       coo_pw[k] = v[coo_p[k]] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    CHKERRQ(PetscLogGpuTimeBegin());
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    CHKERRQ(PetscLogGpuTimeEnd());
    delete w;
    CHKERRQ(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode));
    CHKERRQ(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode));
  } else {
    CHKERRQ(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode));
    CHKERRQ(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode));
  }
  PetscFunctionReturn(0);
}
92 
/* Predicate on zipped (row, col, ...) tuples: true iff the column (tuple
   element 1) lies inside the local diagonal-block range [_cstart, _cend). */
template <typename Tuple>
struct IsNotOffDiagT {
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__ inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return _cstart <= col && col < _cend; /* De Morgan of !(col < _cstart || col >= _cend) */
  }
};
105 
/* Predicate on a global column index: true iff it falls OUTSIDE the local
   diagonal-block range [_cstart, _cend), i.e. belongs to the off-diag block. */
struct IsOffDiag {
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__ inline bool operator() (const PetscInt &c)
  {
    return !(_cstart <= c && c < _cend); /* equivalent to c < _cstart || c >= _cend */
  }
};
117 
/* Map a global index to a local one by subtracting the owning range's start. */
struct GlobToLoc {
  PetscInt _shift;

  GlobToLoc(PetscInt start) : _shift(start) {}
  __host__ __device__ inline PetscInt operator() (const PetscInt &g) { return g - _shift; }
};
129 
/* Device-side "basic" COO preallocation: every (i,j) pair is assumed to be
   locally owned by this rank.

   The indices are copied to the GPU, partitioned so diagonal-block entries
   come first (recording the permutation in cusp->coo_p when the input was
   interleaved), converted from global to local numbering, and used to
   (re)create the sequential diagonal (b->A) and off-diagonal (b->B)
   AIJCUSPARSE blocks plus the compacted column map b->garray. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscInt               N,*jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  /* the diag/offdiag blocks are rebuilt from scratch below */
  if (b->A) CHKERRQ(MatCUSPARSEClearHandle(b->A));
  if (b->B) CHKERRQ(MatCUSPARSEClearHandle(b->B));
  CHKERRQ(MatDestroy(&b->A));
  CHKERRQ(MatDestroy(&b->B));

  CHKERRQ(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  CHKERRQ(PetscLogGpuTimeBegin());
  /* find the first off-diagonal entry, then the first diagonal entry after it;
     if both exist the input is interleaved and must be partitioned */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* partition (i,j,perm) tuples so diagonal entries come first; the
       permutation coo_p is kept so MatSetValuesCOO can reorder v[] identically */
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # off-diagonal entries */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  CHKERRQ(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  CHKERRQ(PetscMalloc1(cusp->coo_no,&jj)); /* jj[] will store compacted col ids of the offdiag part */
  CHKERRCUDA(cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  CHKERRQ(PetscLogGpuTimeBegin());
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  CHKERRQ(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j,wit); /* number of distinct off-process columns */
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  CHKERRQ(PetscMalloc1(noff,&b->garray));
  CHKERRCUDA(cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  CHKERRQ(PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt)));
  /* translate jj[] from global ids to compacted local (0..noff-1) ids via a hash map */
  CHKERRQ(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g));
  CHKERRQ(ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH));
  CHKERRQ(ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj));
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  CHKERRQ(ISLocalToGlobalMappingDestroy(&l2g));

  /* recreate the sequential diagonal block ... */
  CHKERRQ(MatCreate(PETSC_COMM_SELF,&b->A));
  CHKERRQ(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
  CHKERRQ(MatSetType(b->A,MATSEQAIJCUSPARSE));
  CHKERRQ(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
  /* ... and the off-diagonal block with compacted column space of width noff */
  CHKERRQ(MatCreate(PETSC_COMM_SELF,&b->B));
  CHKERRQ(MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff));
  CHKERRQ(MatSetType(b->B,MATSEQAIJCUSPARSE));
  CHKERRQ(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));

  /* GPU memory, cusparse specific call handles it internally */
  CHKERRQ(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get()));
  CHKERRQ(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj));
  CHKERRQ(PetscFree(jj));

  CHKERRQ(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat));
  CHKERRQ(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat));
  CHKERRQ(MatCUSPARSESetHandle(b->A,cusp->handle));
  CHKERRQ(MatCUSPARSESetHandle(b->B,cusp->handle));
  /*
  CHKERRQ(MatCUSPARSESetStream(b->A,cusp->stream));
  CHKERRQ(MatCUSPARSESetStream(b->B,cusp->stream));
  */

  CHKERRQ(MatBindToCPU(b->A,B->boundtocpu));
  CHKERRQ(MatBindToCPU(b->B,B->boundtocpu));
  CHKERRQ(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(0);
}
217 
/* COO preallocation entry point for MPIAIJCUSPARSE.

   Chooses between two paths: the "basic" path when all ranks only have
   locally-owned, non-negative indices (decided collectively via an
   Allreduce), and the "extended" path otherwise, which delegates to
   MatSetPreallocationCOO_MPIAIJ and then mirrors the resulting host-side
   maps/permutations and communication buffers onto the device. Any previous
   COO state and the scatter/column-map caches are reset first. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ         *mpiaij    = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart,rend;

  PetscFunctionBegin;
  /* throw away caches that depend on the previous nonzero structure */
  CHKERRQ(PetscFree(mpiaij->garray));
  CHKERRQ(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  CHKERRQ(PetscTableDestroy(&mpiaij->colmap));
#else
  CHKERRQ(PetscFree(mpiaij->colmap));
#endif
  CHKERRQ(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled                                                                      = PETSC_FALSE;
  mat->was_assembled                                                                  = PETSC_FALSE;
  CHKERRQ(MatResetPreallocationCOO_MPIAIJ(mat));
  CHKERRQ(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    CHKERRQ(PetscLayoutGetRange(mat->rmap,&rstart,&rend));
    CHKERRQ(PetscGetMemType(coo_i,&mtype));
    /* only host indices are scanned; device-resident indices keep the default
       coo_basic = PETSC_TRUE set above */
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  CHKERRMPI(MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    CHKERRQ(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    CHKERRQ(MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    /* device mirrors of the host maps built by MatSetPreallocationCOO_MPIAIJ:
       *1 arrays describe locally-contributed entries, *2 arrays entries
       received from other ranks; Cperm1/sendbuf/recvbuf drive the SF exchange */
    CHKERRCUDA(cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount)));
    CHKERRCUDA(cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount)));
    CHKERRCUDA(cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount)));

    CHKERRCUDA(cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount)));
    CHKERRCUDA(cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount)));
    CHKERRCUDA(cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount)));

    CHKERRCUDA(cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount)));
    CHKERRCUDA(cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount)));
    CHKERRCUDA(cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount)));

    CHKERRCUDA(cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount)));
    CHKERRCUDA(cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount)));
    CHKERRCUDA(cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount)));

    CHKERRCUDA(cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount)));
    CHKERRCUDA(cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar)));
    CHKERRCUDA(cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar)));

    /* upload the host maps; the buffers are filled at MatSetValuesCOO time */
    CHKERRCUDA(cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice));

    CHKERRCUDA(cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice));

    CHKERRCUDA(cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice));

    CHKERRCUDA(cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice));

    CHKERRCUDA(cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(0);
}
301 
/* Kernel: gather kv[perm[i]] into buf[i] for i in [0,nnz) with a grid-stride
   loop, so any launch configuration covers all entries. */
__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  const PetscCount stride = (PetscCount)gridDim.x*blockDim.x;
  for (PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; i < nnz; i += stride) buf[i] = kv[perm[i]];
}
308 
/* Kernel: for each COO group i, accumulate the values kv[perm[k]] for
   k in [jmap[i], jmap[i+1]) into the matrix slot a[imap[i]]; grid-stride
   over i so any launch configuration covers all nnz groups. */
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
{
  const PetscCount stride = (PetscCount)gridDim.x*blockDim.x;
  for (PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; i < nnz; i += stride) {
    for (PetscCount k = jmap[i]; k < jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];
  }
}
315 
/* Set/add COO values on the device.

   Extended path: pack the entries destined to other ranks into sendbuf via
   Cperm1, start the SF reduction, add the locally-owned entries (the *1 maps)
   to A/B while communication is in flight, finish the reduction, then add the
   received entries (the *2 maps). Host-resident v[] is staged to a temporary
   device buffer first. Basic path: delegate to the _Basic routine. In either
   case the matrix ends up offloaded to the GPU. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                 A      = mpiaij->A,B = mpiaij->B;
  PetscCount          Annz1  = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa,*Ba = NULL;
  PetscScalar        *vsend  = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1     = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    CHKERRMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size));
    CHKERRQ(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      CHKERRCUDA(cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar)));
      CHKERRCUDA(cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      CHKERRQ(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); /* write matrix values */
      /* INSERT means overwrite: zero first so the add-kernels produce v, not v+old */
      CHKERRCUDA(cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar)));
      if (size > 1) {
        CHKERRQ(MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba));
        CHKERRCUDA(cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar)));
      }
    } else {
      CHKERRQ(MatSeqAIJCUSPARSEGetArray(A,&Aa)); /* read & write matrix values */
      if (size > 1) CHKERRQ(MatSeqAIJCUSPARSEGetArray(B,&Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);
      CHKERRCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors */
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    if (size > 1) CHKERRQ(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz1) {
      MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
      CHKERRCUDA(cudaPeekAtLastError());
    }
    if (Bnnz1) {
      MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
      CHKERRCUDA(cudaPeekAtLastError());
    }
    if (size > 1) CHKERRQ(PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2) {
      MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
      CHKERRCUDA(cudaPeekAtLastError());
    }
    if (Bnnz2) {
      MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);
      CHKERRCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      CHKERRQ(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
      if (size > 1) CHKERRQ(MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba));
    } else {
      CHKERRQ(MatSeqAIJCUSPARSERestoreArray(A,&Aa));
      if (size > 1) CHKERRQ(MatSeqAIJCUSPARSERestoreArray(B,&Ba));
    }
    /* release the temporary device copy made for host-resident v[] */
    if (PetscMemTypeHost(memtype)) CHKERRCUDA(cudaFree((void*)v1));
  } else {
    CHKERRQ(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
397 
/* Merge the diagonal and off-diagonal blocks into one sequential matrix
   (A_loc) and, if requested, build the IS mapping its local columns to
   global column indices: the owned range first, then the ghost columns. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat             Ad,Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  CHKERRQ(MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap));
  CHKERRQ(MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc));
  if (glob) {
    PetscInt cst,dn,on,*gidx;

    CHKERRQ(MatGetLocalSize(Ad,NULL,&dn));
    CHKERRQ(MatGetLocalSize(Ao,NULL,&on));
    CHKERRQ(MatGetOwnershipRangeColumn(A,&cst,NULL));
    CHKERRQ(PetscMalloc1(dn+on,&gidx));
    /* first dn entries: the owned column range; remaining on entries: cmap[] */
    for (PetscInt i = 0; i < dn+on; i++) gidx[i] = (i < dn) ? cst + i : cmap[i-dn];
    CHKERRQ(ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob));
  }
  PetscFunctionReturn(0);
}
419 
/* Classic (non-COO) preallocation for MPIAIJCUSPARSE.

   Validates the per-row nonzero counts in debug builds, throws away all
   structure-dependent caches, (re)creates the sequential diagonal (b->A) and
   off-diagonal (b->B) AIJCUSPARSE blocks, preallocates them, and propagates
   the stored cuSPARSE formats/handle. Same contract as
   MatMPIAIJSetPreallocation for the host type. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscInt           i;

  PetscFunctionBegin;
  CHKERRQ(PetscLayoutSetUp(B->rmap));
  CHKERRQ(PetscLayoutSetUp(B->cmap));
  /* debug-only sanity checks on the user-provided nnz arrays */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
  /* drop caches tied to the previous nonzero structure */
#if defined(PETSC_USE_CTABLE)
  CHKERRQ(PetscTableDestroy(&b->colmap));
#else
  CHKERRQ(PetscFree(b->colmap));
#endif
  CHKERRQ(PetscFree(b->garray));
  CHKERRQ(VecDestroy(&b->lvec));
  CHKERRQ(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  CHKERRQ(MatDestroy(&b->B));
  if (!b->A) {
    CHKERRQ(MatCreate(PETSC_COMM_SELF,&b->A));
    CHKERRQ(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
    CHKERRQ(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
  }
  if (!b->B) {
    PetscMPIInt size;
    CHKERRMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size));
    CHKERRQ(MatCreate(PETSC_COMM_SELF,&b->B));
    /* uniprocessor: there is no off-diagonal block, so give it zero columns */
    CHKERRQ(MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0));
    CHKERRQ(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));
  }
  CHKERRQ(MatSetType(b->A,MATSEQAIJCUSPARSE));
  CHKERRQ(MatSetType(b->B,MATSEQAIJCUSPARSE));
  CHKERRQ(MatBindToCPU(b->A,B->boundtocpu));
  CHKERRQ(MatBindToCPU(b->B,B->boundtocpu));
  CHKERRQ(MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz));
  CHKERRQ(MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz));
  CHKERRQ(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat));
  CHKERRQ(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat));
  CHKERRQ(MatCUSPARSESetHandle(b->A,cusparseStruct->handle));
  CHKERRQ(MatCUSPARSESetHandle(b->B,cusparseStruct->handle));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
474 
/* yy = A*xx: start the ghost-value scatter for the off-diagonal block, do the
   local diagonal-block product while communication is in flight, then add the
   off-diagonal contribution from the ghosted vector. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  /* begin halo exchange of xx into the local ghosted vector */
  CHKERRQ(VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD));
  /* overlap: diagonal-block product does not need the ghost values */
  CHKERRQ((*mpiaij->A->ops->mult)(mpiaij->A,xx,yy));
  CHKERRQ(VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD));
  /* yy += B * lvec (off-diagonal contribution) */
  CHKERRQ((*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy));
  PetscFunctionReturn(0);
}
486 
/* Zero all stored values of both the diagonal and off-diagonal blocks. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  CHKERRQ(MatZeroEntries(mpiaij->A));
  CHKERRQ(MatZeroEntries(mpiaij->B));
  PetscFunctionReturn(0);
}
496 
/* zz = yy + A*xx, overlapping the ghost-value scatter with the local
   diagonal-block multiply-add. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  /* begin halo exchange needed by the off-diagonal block */
  CHKERRQ(VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD));
  /* zz = yy + Ad*xx while communication is in flight */
  CHKERRQ((*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz));
  CHKERRQ(VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD));
  /* zz += Ao*lvec */
  CHKERRQ((*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz));
  PetscFunctionReturn(0);
}
508 
/* yy = A^T*xx: the off-diagonal transpose product lands in the ghosted
   vector, whose entries are then reverse-scattered (with addition) into yy
   on their owning ranks, on top of the local diagonal-block transpose. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  /* lvec = Ao^T * xx; these are contributions owned by other ranks */
  CHKERRQ((*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec));
  /* yy = Ad^T * xx (local part) */
  CHKERRQ((*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy));
  /* reverse scatter accumulates the remote contributions into yy */
  CHKERRQ(VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
  CHKERRQ(VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
  PetscFunctionReturn(0);
}
520 
/* Record the cuSPARSE storage format to use for SpMV on the diagonal block,
   the off-diagonal block, or both, of an MPIAIJCUSPARSE matrix. The format is
   stored on the GPU context and applied when the blocks are (re)created.
   Raises PETSC_ERR_SUP for any other MatCUSPARSEFormatOperation. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* Fixed error message: it used to list MAT_CUSPARSE_MULT_DIAG twice and a
       nonexistent MAT_CUSPARSE_MULT_ALL instead of the enumerators actually
       handled above (MULT_DIAG, MULT_OFFDIAG, ALL). */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
543 
/* Process -mat_cusparse_* options for an MPIAIJCUSPARSE matrix.

   Registers the per-block and combined storage-format options; note the
   combined -mat_cusparse_storage_format is queried last, so when given it
   overrides the two block-specific options. Skipped for factored matrices. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  CHKERRQ(PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options"));
  if (A->factortype==MAT_FACTOR_NONE) {
    CHKERRQ(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) CHKERRQ(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format));
    CHKERRQ(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) CHKERRQ(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format));
    CHKERRQ(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) CHKERRQ(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
  }
  CHKERRQ(PetscOptionsTail());
  PetscFunctionReturn(0);
}
567 
/* Finish assembly for MPIAIJCUSPARSE.

   Delegates to the MPIAIJ assembly, ensures the ghosted vector is a CUDA
   vector, and — if the nonzero structure changed — destroys the cached
   device-side split-CSR mirror (cusp->deviceMat), freeing via a host copy of
   the struct because the pointers it holds live in device memory. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz = A->nonzerostate; /* snapshot to detect structural changes */

  PetscFunctionBegin;
  CHKERRQ(MatAssemblyEnd_MPIAIJ(A,mode));
  if (mpiaij->lvec) CHKERRQ(VecSetType(mpiaij->lvec,VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    CHKERRQ(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    /* copy the device struct to the host so its internal device pointers can be freed */
    CHKERRQ(PetscNew(&h_mat));
    CHKERRCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      CHKERRCUDA(cudaFree(h_mat->diag.i));
      CHKERRCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) { /* off-diagonal arrays exist only in parallel runs */
        CHKERRCUDA(cudaFree(h_mat->offdiag.i));
        CHKERRCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    CHKERRCUDA(cudaFree(d_mat));
    CHKERRQ(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
598 
/* Destroy an MPIAIJCUSPARSE matrix.

   Frees the cached device-side split-CSR mirror (same host-copy technique as
   MatAssemblyEnd_MPIAIJCUSPARSE), clears the cuSPARSE handles on the blocks,
   destroys the cusparse context and COO data, detaches the composed methods,
   and finally delegates to MatDestroy_MPIAIJ. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    CHKERRQ(PetscInfo(A,"Have device matrix\n"));
    /* bring the struct to the host so its device pointers can be freed */
    CHKERRQ(PetscNew(&h_mat));
    CHKERRCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      CHKERRCUDA(cudaFree(h_mat->diag.i));
      CHKERRCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) { /* present only for parallel (size > 1) runs */
        CHKERRCUDA(cudaFree(h_mat->offdiag.i));
        CHKERRCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    CHKERRCUDA(cudaFree(d_mat));
    CHKERRQ(PetscFree(h_mat));
  }
  /* thrust containers inside may throw; convert to a PETSc error */
  try {
    if (aij->A) CHKERRQ(MatCUSPARSEClearHandle(aij->A));
    if (aij->B) CHKERRQ(MatCUSPARSEClearHandle(aij->B));
    CHKERRCUSPARSE(cusparseDestroy(cusparseStruct->handle));
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      CHKERRCUDA(cudaStreamDestroy(cusparseStruct->stream));
    }
    */
    /* Free COO */
    CHKERRQ(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* remove the type-specific composed methods installed at convert time */
  CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL));
  CHKERRQ(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(0);
}
648 
/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE.

   Duplicates or copies B per the reuse mode, switches the default vector
   type and sub-matrix types to their CUDA variants, attaches a fresh
   Mat_MPIAIJCUSPARSE context (with its cusparse handle) when needed, and
   installs the CUDA-specific operations and composed methods. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  CHKERRQ(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) CHKERRQ(MatDuplicate(B,MAT_COPY_VALUES,newmat));
  else if (reuse == MAT_REUSE_MATRIX) CHKERRQ(MatCopy(B,*newmat,SAME_NONZERO_PATTERN));
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  /* vectors created from this matrix default to CUDA vectors */
  CHKERRQ(PetscFree(A->defaultvectype));
  CHKERRQ(PetscStrallocpy(VECCUDA,&A->defaultvectype));

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) CHKERRQ(MatSetType(a->A,MATSEQAIJCUSPARSE));
  if (a->B) CHKERRQ(MatSetType(a->B,MATSEQAIJCUSPARSE));
  if (a->lvec) CHKERRQ(VecSetType(a->lvec,VECSEQCUDA));

  /* for MAT_INPLACE/MAT_INITIAL, attach the GPU context if not already there */
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE;
    CHKERRCUSPARSE(cusparseCreate(&(cusp->handle)));
    a->spptr = cusp;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  CHKERRQ(PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(0);
}
694 
/* MatCreate_MPIAIJCUSPARSE - constructor registered for MATMPIAIJCUSPARSE: builds a plain
   MPIAIJ matrix and then converts it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  /* make sure the CUDA device is initialized before any cuSPARSE/CUDA calls */
  CHKERRQ(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  CHKERRQ(MatCreate_MPIAIJ(A));
  /* in-place conversion installs the CUSPARSE ops table and composed functions */
  CHKERRQ(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A));
  PetscFunctionReturn(0);
}
703 
704 /*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these
   parameters accurately, performance during matrix assembly can be increased by more than a factor of 50.
711 
712    Collective
713 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal portion of the local submatrix
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal portion of the local submatrix
           (possibly different for each row) or NULL
721 
722    Output Parameter:
723 .  A - the matrix
724 
725    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
726    MatXXXXSetPreallocation() paradigm instead of this routine directly.
727    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
728 
729    Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
731 
732    The AIJ format (also called the Yale sparse matrix format or
733    compressed row storage), is fully compatible with standard Fortran 77
734    storage.  That is, the stored row and column indices can begin at
735    either one (as in Fortran) or zero.  See the users' manual for details.
736 
   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT, o_nz=PETSC_DEFAULT, and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
739    allocation.  For large problems you MUST preallocate memory or you
740    will get TERRIBLE performance, see the users' manual chapter on matrices.
741 
742    By default, this format uses inodes (identical nodes) when possible, to
743    improve numerical efficiency of matrix-vector products and solves. We
744    search for consecutive rows with the same nonzero structure, thereby
745    reusing matrix information to achieve increased efficiency.
746 
747    Level: intermediate
748 
749 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
750 @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt nproc;

  PetscFunctionBegin;
  CHKERRQ(MatCreate(comm,A));
  CHKERRQ(MatSetSizes(*A,m,n,M,N));
  CHKERRMPI(MPI_Comm_size(comm,&nproc));
  /* Pick the sequential or parallel CUSPARSE type based on the communicator size,
     then preallocate with the matching routine */
  if (nproc == 1) {
    CHKERRQ(MatSetType(*A,MATSEQAIJCUSPARSE));
    CHKERRQ(MatSeqAIJSetPreallocation(*A,d_nz,d_nnz));
  } else {
    CHKERRQ(MatSetType(*A,MATMPIAIJCUSPARSE));
    CHKERRQ(MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz));
  }
  PetscFunctionReturn(0);
}
768 
769 /*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.
771 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
773    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
774    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
775 
776    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
777    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
778    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
779    for communicators controlling multiple processes.  It is recommended that you call both of
780    the above preallocation routines for simplicity.
781 
782    Options Database Keys:
783 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
784 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
785 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
786 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
787 
788   Level: beginner
789 
790  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
791 M*/
792 
793 /*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
795 
796   Level: beginner
797 
798  .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
799 M*/
800 
/*
   MatCUSPARSEGetDeviceMatWrite - get a device-resident PetscSplitCSRDataStructure exposing raw GPU
   pointers to the CSR arrays of A (the diagonal block, and in parallel also the off-diagonal block
   plus a column map), so that values can be assembled directly on the device. Works for both seq
   and MPI Mat.

   Input Parameter:
.  A - the matrix; must already be assembled and not factored

   Output Parameter:
.  B - device pointer to the split-CSR descriptor, or NULL when A is factored

   Notes:
   The descriptor is built on first use and cached (spptr->deviceMat); later calls reuse it.
   On return A's offload mask is set to PETSC_OFFLOAD_GPU because the caller is expected to
   write matrix values on the device.
*/
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
  PetscScalar                *aa = NULL,*ba = NULL;
  Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  CHKERRMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  /* locate the host Mat_SeqAIJ (jaca) and the cached device descriptor for the (diagonal) block */
  if (size == 1) {
    PetscBool isseqaij;

    CHKERRQ(PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij));
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      PetscCheck(jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
    } else {
      /* an MPIAIJ matrix on a single-process communicator: use its diagonal block */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(aij->A));
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      bi = NULL; /* no off-diagonal block in the sequential case */
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(aij->A));
      CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(aij->B));
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      PetscCheck(matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheck(matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        /* B is stored with compressed rows on the GPU; build/cache full row offsets for device assembly */
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    /* same treatment for A: materialize full (n+1)-long row offsets on the device once */
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    PetscSplitCSRDataStructure h_mat;

    // create and populate struct on host and copy to device
    CHKERRQ(PetscInfo(A,"Create device matrix\n"));
    CHKERRQ(PetscNew(&h_mat));
    CHKERRCUDA(cudaMalloc((void**)&d_mat,sizeof(*d_mat)));
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscInt   *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      /* colmap maps a global column index to 1-based local index in B (0 means not present) */
      CHKERRQ(PetscCalloc1(N+1,&colmap));
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // the GPU CSR arrays hold 32-bit ints; widen copies to 64-bit PetscInt for the descriptor
        int        *h_bi32, *h_bj32;
        PetscInt   *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        CHKERRQ(PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64));
        CHKERRCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost));
        for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
        CHKERRCUDA(cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost));
        for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];

        CHKERRCUDA(cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64)));
        CHKERRCUDA(cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice));
        CHKERRCUDA(cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64)));
        CHKERRCUDA(cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice));

        h_mat->offdiag.i = d_bi64;
        h_mat->offdiag.j = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE; /* destructor must free the widened copies */

        CHKERRQ(PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64));
      }
#else
      h_mat->offdiag.i = (PetscInt*)bi;
      h_mat->offdiag.j = (PetscInt*)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      CHKERRCUDA(cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap)));
      CHKERRCUDA(cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice));
      CHKERRQ(PetscFree(colmap));
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N; /* NOTE(review): M is set to the global number of columns — confirm intended */
#if defined(PETSC_USE_64BIT_INDICES)
    {
      /* cast: sizeof yields size_t, which must not be passed through varargs for %d */
      PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size of PetscInt = %d",(int)sizeof(PetscInt));
      int        *h_ai32, *h_aj32;
      PetscInt   *h_ai64, *h_aj64, *d_ai64, *d_aj64;
      CHKERRQ(PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64));
      CHKERRCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost));
      for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
      CHKERRCUDA(cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost));
      for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];

      CHKERRCUDA(cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64)));
      CHKERRCUDA(cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice));
      CHKERRCUDA(cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64)));
      CHKERRCUDA(cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice));

      h_mat->diag.i = d_ai64;
      h_mat->diag.j = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      CHKERRQ(PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64));
    }
#else
    h_mat->diag.i = (PetscInt*)ai;
    h_mat->diag.j = (PetscInt*)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    CHKERRCUDA(cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice));
    CHKERRQ(PetscFree(h_mat));
  } else {
    CHKERRQ(PetscInfo(A,"Reusing device matrix\n"));
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
987