xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision ffa8c5705e8ab2cf85ee1d14dbe507a6e2eb5283)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
3 
4 #include <petscconf.h>
5 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
6 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
7 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
8 #include <thrust/advance.h>
9 #include <thrust/partition.h>
10 #include <thrust/sort.h>
11 #include <thrust/unique.h>
12 #include <petscsf.h>
13 
/* Functor for thrust::for_each over a zip iterator: copies the first tuple
   component into the second.  Used in MatSetValuesCOO_MPIAIJCUSPARSE_Basic to
   write a permuted view of the input values into the coo_pw work array. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
23 
/* Free all COO preallocation state held by the CUSPARSE-specific part of the matrix:
   the device-side support arrays of the extended path plus the permutation/work
   arrays of the basic path.  A no-op when no CUSPARSE struct exists. */
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij  = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)aij->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  if (cusp->use_extended_coo) {
    /* release every device buffer allocated by the extended-COO setup */
    void *devbufs[] = {cusp->Aimap1_d,cusp->Ajmap1_d,cusp->Aperm1_d,
                       cusp->Bimap1_d,cusp->Bjmap1_d,cusp->Bperm1_d,
                       cusp->Aimap2_d,cusp->Ajmap2_d,cusp->Aperm2_d,
                       cusp->Bimap2_d,cusp->Bjmap2_d,cusp->Bperm2_d,
                       cusp->Cperm1_d,cusp->sendbuf_d,cusp->recvbuf_d};
    for (size_t k=0; k<sizeof(devbufs)/sizeof(devbufs[0]); k++) PetscCallCUDA(cudaFree(devbufs[k]));
  }
  cusp->use_extended_coo = PETSC_FALSE;
  /* basic-path thrust arrays (deleting NULL is fine) */
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  PetscFunctionReturn(0);
}
55 
/* Basic-path MatSetValuesCOO: splits the COO values v[] between the diagonal block
   a->A (first coo_nd entries) and the off-diagonal block a->B (remaining coo_no
   entries), permuting v[] first when the input was not already partitioned. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total COO entries = diag + offdiag counts */

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    /* coo_p is the diag-first permutation built at preallocation time; apply it to
       v[] so that each block receives a contiguous slice of values */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* values are on the host: stage a device copy first */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
      d_v = w->data();
    }

    /* coo_pw[i] = v[coo_p[i]], computed on the device via a permutation iterator */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode));
  } else {
    /* input already partitioned diag-first (or v is NULL): forward slices directly */
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode));
  }
  PetscFunctionReturn(0);
}
92 
/* Tuple predicate for thrust::partition: true when the column (second tuple
   component) lies inside the local diagonal-block range [_cstart,_cend). */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return _cstart <= col && col < _cend;
  }
};
105 
/* Column predicate: true when c falls outside the local diagonal-block column
   range [_cstart,_cend), i.e. the entry belongs to the off-diagonal block. */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return !(_cstart <= c && c < _cend);
  }
};
117 
/* Unary transform functor: converts a global index into a local one by
   subtracting the owning rank's range start. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &g)
  {
    const PetscInt local = g - _start;
    return local;
  }
};
129 
/* Basic-path COO preallocation (all entries are locally owned): partitions the COO
   indices into diagonal-block and off-diagonal-block entries on the device, builds the
   ghost-column map garray, and creates/preallocates the SEQAIJCUSPARSE blocks b->A and b->B. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscInt               N,*jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  /* the diagonal and off-diagonal blocks are rebuilt from scratch below */
  if (b->A) PetscCall(MatCUSPARSEClearHandle(b->A));
  if (b->B) PetscCall(MatCUSPARSEClearHandle(b->B));
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  PetscCall(PetscLogGpuTimeBegin());
  /* find the first off-diag entry, then the first diag entry after it; if both exist the
     input is interleaved and must be partitioned (diag entries first), recording the
     permutation in coo_p so MatSetValuesCOO can reorder the values the same way */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);  /* permutation from user ordering to partitioned ordering */
    cusp->coo_pw = new THRUSTARRAY(n);    /* work array used when setting values */
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* number of diag-block entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* number of offdiag-block entries */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no,&jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j,wit); /* number of distinct ghost columns */
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff,&b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt)));
  /* compact jj[] from global ghost-column ids to positions within garray via a hash l2g map */
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj));
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  PetscCall(MatCreate(PETSC_COMM_SELF,&b->A));
  PetscCall(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
  PetscCall(MatSetType(b->A,MATSEQAIJCUSPARSE));
  PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
  PetscCall(MatCreate(PETSC_COMM_SELF,&b->B));
  PetscCall(MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff)); /* b->B has one column per ghost column */
  PetscCall(MatSetType(b->B,MATSEQAIJCUSPARSE));
  PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj));
  PetscCall(PetscFree(jj));

  /* propagate the parent's storage-format choices and cusparse handle to the blocks */
  PetscCall(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat));
  PetscCall(MatCUSPARSESetHandle(b->A,cusp->handle));
  PetscCall(MatCUSPARSESetHandle(b->B,cusp->handle));
  /*
  PetscCall(MatCUSPARSESetStream(b->A,cusp->stream));
  PetscCall(MatCUSPARSESetStream(b->B,cusp->stream));
  */

  PetscCall(MatBindToCPU(b->A,B->boundtocpu));
  PetscCall(MatBindToCPU(b->B,B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(0);
}
217 
/* COO preallocation dispatcher: takes the fast "basic" path when every rank's entries
   are locally owned and non-negative, otherwise falls back to the general MPIAIJ path
   and mirrors its host-built COO support arrays (imap/jmap/perm, send/recv buffers)
   on the device for use by MatSetValuesCOO_MPIAIJCUSPARSE. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ         *mpiaij    = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart,rend;

  PetscFunctionBegin;
  /* wipe previous assembly/scatter state: the nonzero pattern is being redefined */
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled                                                                      = PETSC_FALSE;
  mat->was_assembled                                                                  = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap,&rstart,&rend));
    PetscCall(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = PETSC_FALSE; break;}
      }
    }
    /* NOTE(review): device-resident coo_i[] is not scanned, so device input always keeps
       coo_basic true on this rank — confirm that restriction is intended for device input */
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    /* general path: MatSetPreallocationCOO_MPIAIJ builds the host support arrays */
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    /* allocate device mirrors of the host COO support arrays */
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount)));

    /* communication buffers for entries owned by other ranks */
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar)));

    /* copy the host-built maps over */
    PetscCallCUDA(cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(0);
}
301 
/* Gather kernel: buf[i] = kv[perm[i]] for i in [0,nnz).  Grid-stride loop, so any
   launch configuration is valid; used to pack values destined for remote ranks. */
__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  const PetscCount stride = (PetscCount)gridDim.x*blockDim.x;
  for (PetscCount i = (PetscCount)blockIdx.x*blockDim.x + threadIdx.x; i<nnz; i+=stride) buf[i] = kv[perm[i]];
}
308 
/* Scatter-add kernel: for each of the nnz target nonzeros i, accumulates all COO
   input values mapped to it, i.e. a[imap[i]] += kv[perm[k]] for k in
   [jmap[i],jmap[i+1]).  Grid-stride loop; each i is handled by a single thread. */
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
{
  const PetscCount stride = (PetscCount)gridDim.x*blockDim.x;
  for (PetscCount i = (PetscCount)blockIdx.x*blockDim.x + threadIdx.x; i<nnz; i+=stride) {
    for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];
  }
}
315 
/* MatSetValuesCOO for MPIAIJCUSPARSE: inserts (INSERT_VALUES) or accumulates
   (ADD_VALUES) v[] — ordered as the COO indices given at preallocation — into the
   diagonal block A and off-diagonal block B, sending remotely-owned entries through
   the coo_sf star forest and overlapping that communication with local kernels. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                 A      = mpiaij->A,B = mpiaij->B;
  PetscCount          Annz1  = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa,*Ba = NULL;
  PetscScalar        *vsend  = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1     = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size));
    /* NOTE(review): this path uses v without a NULL guard (memtype query, copy,
       pack kernel) — confirm callers never pass v == NULL with extended COO */
    PetscCall(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      PetscCallCUDA(cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); /* write matrix values */
      PetscCallCUDA(cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar))); /* zero, then accumulate: implements insert via add-into-zero */
      if (size > 1) {
        PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba));
        PetscCallCUDA(cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar)));
      }
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa)); /* read & write matrix values */
      if (size > 1) PetscCall(MatSeqAIJCUSPARSEGetArray(B,&Ba));
    }
    /* presumably Bnnz1/Bnnz2 are zero when size == 1, so Ba stays NULL but is
       never dereferenced — verify against MatSetPreallocationCOO_MPIAIJ */

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    if (size > 1) PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz1) {
      MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    if (Bnnz1) {
      MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    if (size > 1) PetscCall(PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2) {
      MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    if (Bnnz2) {
      MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
      if (size > 1) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));
      if (size > 1) PetscCall(MatSeqAIJCUSPARSERestoreArray(B,&Ba));
    }
    /* free the temporary device copy made for host-resident v[] */
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1));
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
397 
/* Merges the diagonal and off-diagonal blocks of A into a single sequential matrix
   A_loc on the GPU.  When glob is requested, also returns the global column indices
   of A_loc's columns: owned columns first (cst..cst+dn-1), then the ghost columns. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat             Ad,Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc));
  if (glob) {
    PetscInt cst,dn,on,*gidx;

    PetscCall(MatGetLocalSize(Ad,NULL,&dn));
    PetscCall(MatGetLocalSize(Ao,NULL,&on));
    PetscCall(MatGetOwnershipRangeColumn(A,&cst,NULL));
    PetscCall(PetscMalloc1(dn+on,&gidx));
    /* owned columns map to cst+k; ghost columns come from the compact map cmap[] */
    for (PetscInt k = 0; k < dn+on; k++) gidx[k] = (k < dn) ? cst + k : cmap[k-dn];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob));
  }
  PetscFunctionReturn(0);
}
419 
/* Classical row-length preallocation for MPIAIJCUSPARSE: (re)creates the sequential
   diagonal block b->A and off-diagonal block b->B as SEQAIJCUSPARSE matrices, forwards
   the d_nz/d_nnz and o_nz/o_nnz hints, and propagates format/handle/CPU-binding. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscInt           i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  /* debug-only validation of the user-provided per-row nonzero counts */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
  /* discard stale column map / scatter state built from a previous pattern */
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF,&b->A));
    PetscCall(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
    PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size));
    PetscCall(MatCreate(PETSC_COMM_SELF,&b->B));
    /* off-diagonal block is empty in the serial case */
    PetscCall(MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0));
    PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));
  }
  PetscCall(MatSetType(b->A,MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B,MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A,B->boundtocpu));
  PetscCall(MatBindToCPU(b->B,B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat));
  PetscCall(MatCUSPARSESetHandle(b->A,cusparseStruct->handle));
  PetscCall(MatCUSPARSESetHandle(b->B,cusparseStruct->handle));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
474 
/* yy = A*xx for the parallel matrix: the halo scatter of xx into lvec is overlapped
   with the local diagonal-block product, then the off-diagonal contribution is added. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD)); /* start gathering ghost values */
  PetscCall((*a->A->ops->mult)(a->A,xx,yy));                                     /* local product while comm is in flight */
  PetscCall(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B,a->lvec,yy,yy));                          /* yy += B*lvec (ghost contribution) */
  PetscFunctionReturn(0);
}
486 
/* Zeroes all stored entries of both sequential blocks of the parallel matrix. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(aij->A));
  PetscCall(MatZeroEntries(aij->B));
  PetscFunctionReturn(0);
}
496 
/* zz = yy + A*xx: same overlap pattern as MatMult — the halo scatter runs while the
   local diagonal-block multadd executes, then the off-diagonal block accumulates. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD)); /* start gathering ghost values */
  PetscCall((*a->A->ops->multadd)(a->A,xx,yy,zz));                               /* zz = yy + A_diag*xx while comm is in flight */
  PetscCall(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B,a->lvec,zz,zz));                          /* zz += B*lvec (ghost contribution) */
  PetscFunctionReturn(0);
}
508 
/* yy = A^T*xx: the off-diagonal transpose product is computed into lvec first, the
   local transpose product into yy, then lvec is scattered back (reverse, additive)
   so remote rows receive their contributions. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B,xx,a->lvec)); /* lvec = B^T*xx (contributions destined for other ranks) */
  PetscCall((*a->A->ops->multtranspose)(a->A,xx,yy));      /* yy = A_diag^T*xx */
  PetscCall(VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
  PetscFunctionReturn(0);
}
520 
/* Records the cuSPARSE storage format to use for the diagonal block, the off-diagonal
   block, or both.  The choice is applied to the sequential blocks when they are
   (re)created during preallocation.

   Fix: the error message for an unsupported operation listed MAT_CUSPARSE_MULT_DIAG
   twice and named a nonexistent MAT_CUSPARSE_MULT_ALL; it now names the enum values
   this switch actually accepts (MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG,
   MAT_CUSPARSE_ALL). */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
543 
/* Processes the -mat_cusparse_{mult_diag_,mult_offdiag_,}storage_format options.
   Only meaningful for unfactored matrices; the combined option is processed last and
   therefore overrides the per-block ones when both are given. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  PetscCall(PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options"));
  if (A->factortype==MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
  }
  PetscCall(PetscOptionsTail());
  PetscFunctionReturn(0);
}
567 
/* Finishes assembly: runs the host MPIAIJ assembly, converts the ghost work vector to
   a CUDA vector, and invalidates any cached device split-CSR matrix descriptor when
   the nonzero pattern changed during assembly. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz = A->nonzerostate; /* snapshot before assembly to detect pattern changes */

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A,mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec,VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    /* the descriptor itself lives in device memory; copy it to the host to reach
       the device pointers embedded in it before freeing them */
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
598 
/* Destroys an MPIAIJCUSPARSE matrix: frees the cached device split-CSR descriptor (if
   any), the cusparse handle and COO support data, removes the type-specific method
   compositions, then delegates to MatDestroy_MPIAIJ. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    /* same teardown as MatAssemblyEnd: the descriptor is device-resident, so copy it
       to the host to reach the device pointers it stores */
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    PetscCall(PetscInfo(A,"Have device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
  }
  /* thrust operations inside the COO reset may throw; translate to a PETSc error */
  try {
    if (aij->A) PetscCall(MatCUSPARSEClearHandle(aij->A));
    if (aij->B) PetscCall(MatCUSPARSEClearHandle(aij->B));
    PetscCallCUSPARSE(cusparseDestroy(cusparseStruct->handle));
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      PetscCallCUDA(cudaStreamDestroy(cusparseStruct->stream));
    }
    */
    /* Free COO */
    PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* detach the type-specific methods before the base-class destroy runs */
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(0);
}
648 
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat         A;
  Mat_MPIAIJ *mpiaij;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  /* Obtain the matrix to convert: duplicate for a fresh matrix, copy values for reuse,
     or operate directly on B for MAT_INPLACE_MATRIX (where *newmat == B) */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(B,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(B,*newmat,SAME_NONZERO_PATTERN));
  }
  A = *newmat;

  /* Allow GPU execution and make CUDA vectors the default work-vector type */
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype));

  /* Convert the diagonal/off-diagonal sequential blocks and the local work vector */
  mpiaij = (Mat_MPIAIJ*)A->data;
  if (mpiaij->A)    PetscCall(MatSetType(mpiaij->A,MATSEQAIJCUSPARSE));
  if (mpiaij->B)    PetscCall(MatSetType(mpiaij->B,MATSEQAIJCUSPARSE));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec,VECSEQCUDA));

  /* Create the cuSPARSE bookkeeping structure (with its library handle) if absent */
  if (reuse != MAT_REUSE_MATRIX && !mpiaij->spptr) {
    Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE;
    PetscCallCUSPARSE(cusparseCreate(&(cusp->handle)));
    mpiaij->spptr = cusp;
  }

  /* Install the CUSPARSE-specific operations table */
  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  /* Register the type name and the type-specific composed methods */
  PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(0);
}
694 
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  /* Ensure the CUDA device is initialized before any cuSPARSE/CUDA calls are made */
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  /* Build a plain MPIAIJ matrix, then convert it in place to the CUSPARSE type */
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A));
  PetscFunctionReturn(0);
}
703 
704 /*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these
   parameters accurately, performance during matrix assembly can be increased by more than a factor of 50.
711 
712    Collective
713 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the DIAGONAL portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the DIAGONAL portion of the
           local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the OFF-DIAGONAL portion of the
           local submatrix (possibly different for each row) or NULL
721 
722    Output Parameter:
723 .  A - the matrix
724 
725    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
726    MatXXXXSetPreallocation() paradigm instead of this routine directly.
727    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
728 
   Notes:
   If d_nnz is given then d_nz is ignored; likewise, if o_nnz is given then o_nz is ignored
731 
732    The AIJ format (also called the Yale sparse matrix format or
733    compressed row storage), is fully compatible with standard Fortran 77
734    storage.  That is, the stored row and column indices can begin at
735    either one (as in Fortran) or zero.  See the users' manual for details.
736 
737    Specify the preallocated storage with either nz or nnz (not both).
738    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
739    allocation.  For large problems you MUST preallocate memory or you
740    will get TERRIBLE performance, see the users' manual chapter on matrices.
741 
742    By default, this format uses inodes (identical nodes) when possible, to
743    improve numerical efficiency of matrix-vector products and solves. We
744    search for consecutive rows with the same nonzero structure, thereby
745    reusing matrix information to achieve increased efficiency.
746 
747    Level: intermediate
748 
749 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
750 @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt nproc;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,M,N));
  /* Pick the sequential or parallel CUSPARSE type based on the communicator size */
  PetscCallMPI(MPI_Comm_size(comm,&nproc));
  if (nproc == 1) {
    /* single process: sequential CUSPARSE matrix, only diagonal preallocation applies */
    PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A,d_nz,d_nnz));
  } else {
    /* multiple processes: MPI CUSPARSE matrix with diagonal/off-diagonal preallocation */
    PetscCall(MatSetType(*A,MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz));
  }
  PetscFunctionReturn(0);
}
768 
769 /*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
775 
776    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
777    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
778    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
779    for communicators controlling multiple processes.  It is recommended that you call both of
780    the above preallocation routines for simplicity.
781 
782    Options Database Keys:
783 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
784 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
785 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
786 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
787 
788   Level: beginner
789 
790  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
791 M*/
792 
793 /*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
795 
796   Level: beginner
797 
798  .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
799 M*/
800 
801 // get GPU pointers to stripped down Mat. For both seq and MPI Mat.
/*
   MatCUSPARSEGetDeviceMatWrite - returns (creating on first use) the device-resident
   PetscSplitCSRDataStructure describing A's CSR data for device-side assembly.

   Input Parameter:
.  A - the (already assembled, unfactored) AIJ CUSPARSE matrix (seq or MPI)

   Output Parameter:
.  B - the device pointer to the split-CSR descriptor, or NULL for factored matrices

   Fixes applied (review): error message used "%d" for sizeof(PetscInt) (a size_t),
   which is undefined behavior on LP64; now cast to int. Also fixed message/comment typos.
*/
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
  PetscScalar                *aa = NULL,*ba = NULL;
  Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices are not supported; signal with a NULL descriptor */
    *B = NULL;
    PetscFunctionReturn(0);
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  // get jaca (host Mat_SeqAIJ of the diagonal block) and the cached device descriptor
  if (size == 1) {
    PetscBool isseqaij;

    PetscCall(PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij));
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      PetscCheck(jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    } else {
      /* an MPIAIJ matrix on a one-process communicator: use its diagonal block */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      bi = NULL;
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      PetscCheck(matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheck(matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        /* compressed-row B: build (once) an uncompressed device row-offset array */
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    /* compressed-row A: build (once) an uncompressed device row-offset array */
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    PetscSplitCSRDataStructure h_mat;

    // create and populate struct on host and copy to device
    PetscCall(PetscInfo(A,"Create device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMalloc((void**)&d_mat,sizeof(*d_mat)));
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscInt   *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      /* colmap maps a global column index to 1-based local off-diagonal column (0 = absent) */
      PetscCall(PetscCalloc1(N+1,&colmap));
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make a long version of these: device CSR indices are 32-bit, PetscInt is 64-bit
        int        *h_bi32, *h_bj32;
        PetscInt   *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        PetscCall(PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64));
        PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost));
        for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
        PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost));
        for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];

        PetscCallCUDA(cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64)));
        PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64)));
        PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice));

        h_mat->offdiag.i = d_bi64;
        h_mat->offdiag.j = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE; /* widened copies are owned; freed in destroy */

        PetscCall(PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64));
      }
#else
      /* 32-bit PetscInt: reuse the cuSPARSE-owned device index arrays directly */
      h_mat->offdiag.i = (PetscInt*)bi;
      h_mat->offdiag.j = (PetscInt*)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      PetscCallCUDA(cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap)));
      PetscCallCUDA(cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice));
      PetscCall(PetscFree(colmap));
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N; /* NOTE(review): M holds the global number of columns here — confirm intended */
#if defined(PETSC_USE_64BIT_INDICES)
    {
      /* sanity check; was "size pof PetscInt = %d" with a size_t argument (UB on LP64) */
      PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"sizeof(PetscInt) = %d",(int)sizeof(PetscInt));
      int        *h_ai32, *h_aj32;
      PetscInt   *h_ai64, *h_aj64, *d_ai64, *d_aj64;
      PetscCall(PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64));
      PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost));
      for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
      PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost));
      for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];

      PetscCallCUDA(cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64)));
      PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64)));
      PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice));

      h_mat->diag.i = d_ai64;
      h_mat->diag.j = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      PetscCall(PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64));
    }
#else
    h_mat->diag.i = (PetscInt*)ai;
    h_mat->diag.j = (PetscInt*)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    PetscCallCUDA(cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice));
    PetscCall(PetscFree(h_mat));
  } else {
    PetscCall(PetscInfo(A,"Reusing device matrix\n"));
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
987