xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 6a29ce69e29aaf78b7325cac1ae663c7f0b163da)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_CXX_COMPLEX_FIX
3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
4 
5 #include <petscconf.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
7 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
8 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
9 #include <thrust/advance.h>
10 #include <petscsf.h>
11 
/* Tuple functor for thrust::for_each over a zip iterator: copies the first
   tuple element into the second, i.e. performs a gather/scatter assignment. */
struct VecCUDAEquals
{
  template <typename ZippedPair>
  __host__ __device__
  void operator()(ZippedPair zp)
  {
    /* destination (element 1) receives source (element 0) */
    thrust::get<1>(zp) = thrust::get<0>(zp);
  }
};
21 
/* Insert the numerical values v (host or device memory) at the COO locations
   previously registered with MatSetPreallocationCOO_MPIAIJCUSPARSE, splitting
   them between the diagonal (a->A) and off-diagonal (a->B) sequential blocks.
   On return the matrix is marked assembled with data resident on the GPU. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total COO entries: diagonal + off-diagonal */
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (cusp->coo_p && v) { /* coo_p: permutation built at preallocation time when entries had to be reordered */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL; /* temporary device copy of v when v lives on the host */

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* gather v through the permutation coo_p into the work array coo_pw */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    /* first coo_nd permuted values belong to the diagonal block, the remainder to the off-diagonal block */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else { /* no permutation needed (or v == NULL): values are already ordered diagonal first, off-diagonal second */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU; /* values were written on the device */
  PetscFunctionReturn(0);
}
68 
/* Tuple predicate used with thrust::partition: true when the column index
   (tuple element 1) lies inside the locally owned range [cstart,cend),
   i.e. the entry belongs to the diagonal block. */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return col >= _cstart && col < _cend;
  }
};
81 
/* Column predicate: true when the column index falls outside the locally
   owned range [cstart,cend), i.e. the entry belongs to the off-diagonal block. */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    const bool owned = (_cstart <= c) && (c < _cend);
    return !owned;
  }
};
93 
/* Index transform: shifts a global index down by the local ownership
   offset, mapping global numbering to process-local numbering. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    const PetscInt offset = _start;
    return c - offset;
  }
};
105 
/* Preallocate the MPIAIJCUSPARSE matrix B from n COO (i,j) pairs given in
   global numbering. The entries are partitioned on the GPU into diagonal
   (owned columns) and off-diagonal parts, b->A and b->B are recreated as
   SEQAIJCUSPARSE matrices, and the column map b->garray is rebuilt.
   A permutation (cusp->coo_p) is recorded when the input order interleaves
   diagonal and off-diagonal entries, so that MatSetValuesCOO can reorder v. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;      /* host copy of off-diagonal column indices */
  size_t                 noff = 0; /* number of distinct off-diagonal (ghost) columns */
  THRUSTINTARRAY         d_i(n);   /* device copy of row indices */
  THRUSTINTARRAY         d_j(n);   /* device copy of column indices */
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* discard any previous diagonal/off-diagonal blocks and associated data */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* locate the first off-diagonal entry and the first diagonal entry after it;
     if both exist the input interleaves the two parts and must be permuted */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    /* stable-partition rows/cols/identity together: diagonal entries first;
       coo_p then records where each original entry ended up */
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* number of diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* number of off-diagonal entries */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd);
  /* sort+unique the off-diagonal columns on the device to build the ghost column list */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* translate the off-diagonal global columns (jj) to their position in garray */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* recreate the sequential diagonal (b->A) and off-diagonal (b->B) blocks */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  /* propagate cusparse configuration to the new blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  /* structure is set but no values yet: not assembled, data considered on CPU */
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
209 
/* Build (or reuse) a sequential matrix A_loc merging the diagonal and
   off-diagonal blocks of A on this process; when glob is non-NULL also
   return the global column indices of the merged matrix as an IS. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            adiag,aoff;
  const PetscInt *garray;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&adiag,&aoff,&garray);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(adiag,aoff,scall,A_loc);CHKERRQ(ierr);
  if (glob) { /* caller also wants the global column numbering of the merge */
    PetscInt cst,k,dn,on,*indices;

    ierr = MatGetLocalSize(adiag,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(aoff,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&indices);CHKERRQ(ierr);
    /* owned columns first (offset by the ownership start), ghost columns after */
    for (k=0; k<dn; k++) indices[k]    = cst + k;
    for (k=0; k<on; k++) indices[k+dn] = garray[k];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)adiag),dn+on,indices,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
232 
/* Standard nnz-based preallocation for the MPIAIJCUSPARSE type: (re)creates
   the diagonal (b->A) and off-diagonal (b->B) blocks as SEQAIJCUSPARSE
   matrices, preallocates them, and propagates the cusparse format, handle
   and stream configuration to both blocks. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* in debug builds, validate the user-provided per-row nonzero counts */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  /* drop any stale column map and communication data from a previous assembly */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    /* off-diagonal block has B->cmap->N columns in parallel, none in serial */
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* propagate cusparse configuration to both blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
291 
/* Context stored in C->product->data for MPIAIJCUSPARSE matrix-matrix products */
typedef struct {
  Mat         *mp;    /* intermediate products */
  PetscBool   *mptmp; /* is the intermediate product temporary ? */
  PetscInt    cp;     /* number of intermediate products */

  /* support for MatGetBrowsOfAoCols_MPIAIJ for P_oth */
  PetscInt    *startsj_s,*startsj_r; /* send/receive offsets used by the row exchange */
  PetscScalar *bufa;                 /* communication value buffer */
  Mat         P_oth;                 /* remote rows of P needed by the off-diagonal part */

  /* may take advantage of merging product->B */
  Mat Bloc;

  /* cusparse does not have support to split between symbolic and numeric phases.
     When api_user is true, we don't need to update the numerical values
     of the temporary storage */
  PetscBool reusesym;

  /* support for COO values insertion */
  PetscScalar *coo_v,*coo_w; /* coo_v: values fed to MatSetValuesCOO; coo_w: staging buffer when sf is used */
  PetscSF     sf; /* if present, non-local values insertion (i.e. AtB or PtAP) */
} MatMatMPIAIJCUSPARSE;
314 
/* Release every resource held by a MatMatMPIAIJCUSPARSE product context,
   then free the context itself. Installed as C->product->destroy. */
PetscErrorCode MatDestroy_MatMatMPIAIJCUSPARSE(void *data)
{
  MatMatMPIAIJCUSPARSE *mm = (MatMatMPIAIJCUSPARSE*)data;
  PetscErrorCode       ierr;
  PetscInt             k;

  PetscFunctionBegin;
  /* communication buffers from MatGetBrowsOfAoCols_MPIAIJ */
  ierr = PetscFree2(mm->startsj_s,mm->startsj_r);CHKERRQ(ierr);
  ierr = PetscFree(mm->bufa);CHKERRQ(ierr);
  /* COO value staging arrays */
  ierr = PetscFree(mm->coo_v);CHKERRQ(ierr);
  ierr = PetscFree(mm->coo_w);CHKERRQ(ierr);
  ierr = MatDestroy(&mm->P_oth);CHKERRQ(ierr);
  ierr = MatDestroy(&mm->Bloc);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&mm->sf);CHKERRQ(ierr);
  /* intermediate product matrices */
  for (k = 0; k < mm->cp; k++) {
    ierr = MatDestroy(&mm->mp[k]);CHKERRQ(ierr);
  }
  ierr = PetscFree(mm->mp);CHKERRQ(ierr);
  ierr = PetscFree(mm->mptmp);CHKERRQ(ierr);
  ierr = PetscFree(mm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
337 
/* Numeric phase of the MPIAIJCUSPARSE matrix-matrix product: refresh the
   temporary matrices, run the numeric phase of every intermediate product,
   collect their values into a contiguous buffer (communicating off-process
   contributions through the SF if needed), and insert them into C via COO. */
static PetscErrorCode MatProductNumeric_MPIAIJCUSPARSE_MPIAIJCUSPARSE(Mat C)
{
  MatMatMPIAIJCUSPARSE *mmdata;
  PetscScalar          *tmp;
  PetscInt             i,n;
  PetscErrorCode       ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatMPIAIJCUSPARSE*)C->product->data;
  /* with an SF the values are staged in coo_w and gathered into coo_v below */
  tmp = mmdata->sf ? mmdata->coo_w : mmdata->coo_v;
  if (!mmdata->reusesym) { /* update temporary matrices */
    if (mmdata->P_oth) {
      ierr = MatGetBrowsOfAoCols_MPIAIJ(C->product->A,C->product->B,MAT_REUSE_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);
    }
    if (mmdata->Bloc) {
      ierr = MatMPIAIJGetLocalMatMerge(C->product->B,MAT_REUSE_MATRIX,NULL,&mmdata->Bloc);CHKERRQ(ierr);
    }
  }
  /* reusesym is only valid for the first numeric call right after symbolic */
  mmdata->reusesym = PETSC_FALSE;
  for (i = 0, n = 0; i < mmdata->cp; i++) {
    Mat_SeqAIJ        *mm = (Mat_SeqAIJ*)mmdata->mp[i]->data;
    const PetscScalar *vv;

    if (!mmdata->mp[i]->ops->productnumeric) SETERRQ1(PetscObjectComm((PetscObject)mmdata->mp[i]),PETSC_ERR_PLIB,"Missing numeric op for %s",MatProductTypes[mmdata->mp[i]->product->type]);
    ierr = (*mmdata->mp[i]->ops->productnumeric)(mmdata->mp[i]);CHKERRQ(ierr);
    if (mmdata->mptmp[i]) continue; /* temporary products feed later products, not C */
    /* TODO: add support for using GPU data directly */
    ierr = MatSeqAIJGetArrayRead(mmdata->mp[i],&vv);CHKERRQ(ierr);
    ierr = PetscArraycpy(tmp + n,vv,mm->nz);CHKERRQ(ierr);
    ierr = MatSeqAIJRestoreArrayRead(mmdata->mp[i],&vv);CHKERRQ(ierr);
    n   += mm->nz;
  }
  if (mmdata->sf) { /* offprocess insertion */
    ierr = PetscSFGatherBegin(mmdata->sf,MPIU_SCALAR,tmp,mmdata->coo_v);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(mmdata->sf,MPIU_SCALAR,tmp,mmdata->coo_v);CHKERRQ(ierr);
  }
  ierr = MatSetValuesCOO(C,mmdata->coo_v,INSERT_VALUES);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
379 
/* Symbolic phase for AB, AtB (Pt*A) and PtAP products of MPIAIJCUSPARSE
   matrices. The parallel product is decomposed into at most
   MAX_NUMBER_INTERMEDIATE sequential products; their structure is then
   translated into global COO coordinates used to preallocate C. */
#define MAX_NUMBER_INTERMEDIATE 4
static PetscErrorCode MatProductSymbolic_MPIAIJCUSPARSE_MPIAIJCUSPARSE(Mat C)
{
  Mat_Product            *product = C->product;
  Mat                    A,P,mp[MAX_NUMBER_INTERMEDIATE];
  Mat_MPIAIJ             *a,*p;
  MatMatMPIAIJCUSPARSE   *mmdata;
  ISLocalToGlobalMapping P_oth_l2g = NULL;
  IS                     glob = NULL;
  const PetscInt         *globidx,*P_oth_idx;
  const PetscInt         *cmapa[MAX_NUMBER_INTERMEDIATE],*rmapa[MAX_NUMBER_INTERMEDIATE];
  PetscInt               cp = 0,m,n,M,N,ncoo,*coo_i,*coo_j,cmapt[MAX_NUMBER_INTERMEDIATE],rmapt[MAX_NUMBER_INTERMEDIATE],i,j;
  MatProductType         ptype;
  PetscBool              mptmp[MAX_NUMBER_INTERMEDIATE],hasoffproc = PETSC_FALSE;
  PetscMPIInt            size;
  PetscErrorCode         ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  ptype = product->type;
  /* for symmetric A, At*B is handled as A*B */
  if (product->A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  /* determine operands and the local/global sizes of the result C */
  switch (ptype) {
  case MATPRODUCT_AB:
    A = product->A;
    P = product->B;
    m = A->rmap->n;
    n = P->cmap->n;
    M = A->rmap->N;
    N = P->cmap->N;
    break;
  case MATPRODUCT_AtB:
    P = product->A;
    A = product->B;
    m = P->cmap->n;
    n = A->cmap->n;
    M = P->cmap->N;
    N = A->cmap->N;
    hasoffproc = PETSC_TRUE; /* contributions to rows owned by other ranks */
    break;
  case MATPRODUCT_PtAP:
    A = product->A;
    P = product->B;
    m = P->cmap->n;
    n = P->cmap->n;
    M = P->cmap->N;
    N = P->cmap->N;
    hasoffproc = PETSC_TRUE; /* contributions to rows owned by other ranks */
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)C),&size);CHKERRQ(ierr);
  if (size == 1) hasoffproc = PETSC_FALSE; /* serial run: everything is local */

  /* defaults for the row/column remapping of each intermediate product:
     rmapt/cmapt 0 = identity copy, 1 = shift by ownership start, 2 = map through rmapa/cmapa */
  for (i=0;i<MAX_NUMBER_INTERMEDIATE;i++) {
    mp[i]    = NULL;
    mptmp[i] = PETSC_FALSE;
    rmapt[i] = 0;
    cmapt[i] = 0;
    rmapa[i] = NULL;
    cmapa[i] = NULL;
  }

  a = (Mat_MPIAIJ*)A->data;
  p = (Mat_MPIAIJ*)P->data;
  ierr = MatSetSizes(C,m,n,M,N);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  ierr = MatSetType(C,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->reusesym = product->api_user;
  /* build the sequential intermediate products for the chosen decomposition */
  switch (ptype) {
  case MATPRODUCT_AB: /* A * P */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);

    if (1) { /* A_diag * P_loc and A_off * P_oth TODO: add customization for this */
      /* P is product->B */
      ierr = MatMPIAIJGetLocalMatMerge(P,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
      ierr = MatProductCreate(a->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
      rmapt[cp] = 1;
      cmapt[cp] = 2;
      cmapa[cp] = globidx;
      mptmp[cp] = PETSC_FALSE;
      cp++;
    } else { /* A_diag * P_diag and A_diag * P_off and A_off * P_oth */
      ierr = MatProductCreate(a->A,p->A,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      rmapt[cp] = 1;
      cmapt[cp] = 1;
      mptmp[cp] = PETSC_FALSE;
      cp++;
      ierr = MatProductCreate(a->A,p->B,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      rmapt[cp] = 1;
      cmapt[cp] = 2;
      cmapa[cp] = p->garray;
      mptmp[cp] = PETSC_FALSE;
      cp++;
    }
    if (mmdata->P_oth) { /* A_off * P_oth: needs P_oth's columns in a compacted local numbering */
      ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth,&P_oth_l2g);CHKERRQ(ierr);
      ierr = ISLocalToGlobalMappingGetIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
      ierr = MatSetType(mmdata->P_oth,((PetscObject)(a->B))->type_name);CHKERRQ(ierr);
      ierr = MatProductCreate(a->B,mmdata->P_oth,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      rmapt[cp] = 1;
      cmapt[cp] = 2;
      cmapa[cp] = P_oth_idx;
      mptmp[cp] = PETSC_FALSE;
      cp++;
    }
    break;
  case MATPRODUCT_AtB: /* (P^t * A): P_diag * A_loc + P_off * A_loc */
    /* A is product->B */
    ierr = MatMPIAIJGetLocalMatMerge(A,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
    ierr = MatProductCreate(p->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
    ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
    ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
    mp[cp]->product->api_user = product->api_user;
    ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
    if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
    ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
    ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
    rmapt[cp] = 1;
    cmapt[cp] = 2;
    cmapa[cp] = globidx;
    mptmp[cp] = PETSC_FALSE;
    cp++;
    /* P_off^t * A_loc contributes to rows owned by other processes (rmapt == 2) */
    ierr = MatProductCreate(p->B,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
    ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
    ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
    mp[cp]->product->api_user = product->api_user;
    ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
    if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
    ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
    rmapt[cp] = 2;
    rmapa[cp] = p->garray;
    cmapt[cp] = 2;
    cmapa[cp] = globidx;
    mptmp[cp] = PETSC_FALSE;
    cp++;
    break;
  case MATPRODUCT_PtAP:
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);
    /* P is product->B */
    ierr = MatMPIAIJGetLocalMatMerge(P,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
    /* P_loc^t * A_diag * P_loc */
    ierr = MatProductCreate(a->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
    ierr = MatProductSetType(mp[cp],MATPRODUCT_PtAP);CHKERRQ(ierr);
    ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
    mp[cp]->product->api_user = product->api_user;
    ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
    if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
    ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
    ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
    rmapt[cp] = 2;
    rmapa[cp] = globidx;
    cmapt[cp] = 2;
    cmapa[cp] = globidx;
    mptmp[cp] = PETSC_FALSE;
    cp++;
    if (mmdata->P_oth) {
      ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth,&P_oth_l2g);CHKERRQ(ierr);
      ierr = MatSetType(mmdata->P_oth,((PetscObject)(a->B))->type_name);CHKERRQ(ierr);
      ierr = ISLocalToGlobalMappingGetIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
      /* A_off * P_oth: temporary (mptmp) — only feeds the next product below */
      ierr = MatProductCreate(a->B,mmdata->P_oth,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      mptmp[cp] = PETSC_TRUE;
      cp++;
      /* P_loc^t * (A_off * P_oth) */
      ierr = MatProductCreate(mmdata->Bloc,mp[1],NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      rmapt[cp] = 2;
      rmapa[cp] = globidx;
      cmapt[cp] = 2;
      cmapa[cp] = P_oth_idx;
      mptmp[cp] = PETSC_FALSE;
      cp++;
    }
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]);
  }
  /* persist the list of intermediate products in the product context */
  ierr = PetscMalloc1(cp,&mmdata->mp);CHKERRQ(ierr);
  for (i = 0; i < cp; i++) mmdata->mp[i] = mp[i];
  ierr = PetscMalloc1(cp,&mmdata->mptmp);CHKERRQ(ierr);
  for (i = 0; i < cp; i++) mmdata->mptmp[i] = mptmp[i];
  mmdata->cp = cp;
  C->product->data       = mmdata;
  C->product->destroy    = MatDestroy_MatMatMPIAIJCUSPARSE;
  C->ops->productnumeric = MatProductNumeric_MPIAIJCUSPARSE_MPIAIJCUSPARSE;

  /* prepare coo coordinates for values insertion */
  ncoo = 0;
  for (cp = 0; cp < mmdata->cp; cp++) {
    Mat_SeqAIJ *mm = (Mat_SeqAIJ*)mp[cp]->data;
    if (mptmp[cp]) continue; /* temporary products do not contribute entries to C */
    ncoo += mm->nz;
  }
  ierr = PetscMalloc2(ncoo,&coo_i,ncoo,&coo_j);CHKERRQ(ierr);
  ncoo = 0;
  /* translate each non-temporary product's CSR structure into global COO coordinates */
  for (cp = 0; cp < mmdata->cp; cp++) {
    Mat_SeqAIJ     *mm = (Mat_SeqAIJ*)mp[cp]->data;
    PetscInt       *coi = coo_i + ncoo;
    PetscInt       *coj = coo_j + ncoo;
    const PetscInt mr = mp[cp]->rmap->n;
    const PetscInt *jj  = mm->j;
    const PetscInt *ii  = mm->i;

    if (mptmp[cp]) continue;
    /* rows coo */
    if (!rmapt[cp]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"This should not happen");
    else if (rmapt[cp] == 1) { /* local to global for owned rows of  */
      const PetscInt rs = C->rmap->rstart;
      for (i = 0; i < mr; i++) {
        const PetscInt gr = i + rs;
        for (j = ii[i]; j < ii[i+1]; j++) {
          coi[j] = gr;
        }
      }
    } else { /* offprocess */
      const PetscInt *rmap = rmapa[cp];
      for (i = 0; i < mr; i++) {
        const PetscInt gr = rmap[i];
        for (j = ii[i]; j < ii[i+1]; j++) {
          coi[j] = gr;
        }
      }
    }
    /* columns coo */
    if (!cmapt[cp]) {
      ierr = PetscArraycpy(coj,jj,mm->nz);CHKERRQ(ierr);
    } else if (cmapt[cp] == 1) { /* local to global for owned columns of P */
      const PetscInt cs = P->cmap->rstart;
      for (j = 0; j < mm->nz; j++) {
        coj[j] = jj[j] + cs;
      }
    } else { /* offdiag */
      const PetscInt *cmap = cmapa[cp];
      for (j = 0; j < mm->nz; j++) {
        coj[j] = cmap[jj[j]];
      }
    }
    ncoo += mm->nz;
  }
  /* the index arrays borrowed above are no longer needed */
  if (glob) {
    ierr = ISRestoreIndices(glob,&globidx);CHKERRQ(ierr);
  }
  ierr = ISDestroy(&glob);CHKERRQ(ierr);
  if (P_oth_l2g) {
    ierr = ISLocalToGlobalMappingRestoreIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
  }
  ierr = ISLocalToGlobalMappingDestroy(&P_oth_l2g);CHKERRQ(ierr);

  if (hasoffproc) { /* offproc values insertion */
    const PetscInt *sfdeg;
    const PetscInt n = P->cmap->n; /* NOTE: shadows the outer n on purpose */
    PetscInt ncoo2,*coo_i2,*coo_j2;

    /* route each COO entry to the rank owning its row, then gather the
       (i,j) pairs so preallocation below uses only locally owned entries */
    ierr = PetscSFCreate(PetscObjectComm((PetscObject)C),&mmdata->sf);CHKERRQ(ierr);
    ierr = PetscSFSetGraphLayout(mmdata->sf,P->cmap,ncoo,NULL,PETSC_OWN_POINTER,coo_i);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeBegin(mmdata->sf,&sfdeg);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(mmdata->sf,&sfdeg);CHKERRQ(ierr);
    for (i = 0, ncoo2 = 0; i < n; i++) ncoo2 += sfdeg[i];
    ierr = PetscMalloc2(ncoo2,&coo_i2,ncoo2,&coo_j2);CHKERRQ(ierr);
    ierr = PetscSFGatherBegin(mmdata->sf,MPIU_INT,coo_i,coo_i2);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(mmdata->sf,MPIU_INT,coo_i,coo_i2);CHKERRQ(ierr);
    ierr = PetscSFGatherBegin(mmdata->sf,MPIU_INT,coo_j,coo_j2);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(mmdata->sf,MPIU_INT,coo_j,coo_j2);CHKERRQ(ierr);
    ierr = PetscFree2(coo_i,coo_j);CHKERRQ(ierr);
    ierr = PetscMalloc1(ncoo,&mmdata->coo_w);CHKERRQ(ierr); /* staging buffer sized for the pre-gather entries */
    coo_i = coo_i2;
    coo_j = coo_j2;
    ncoo  = ncoo2;
  }

  /* preallocate with COO data */
  ierr = MatSetPreallocationCOO(C,ncoo,coo_i,coo_j);CHKERRQ(ierr);
  ierr = PetscMalloc1(ncoo,&mmdata->coo_v);CHKERRQ(ierr);
  ierr = PetscFree2(coo_i,coo_j);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
694 
/* Select the symbolic-product implementation for a matrix product involving
   an MPIAIJCUSPARSE operand. The CUSPARSE path is installed only when B is
   of type MPIAIJCUSPARSE, is not bound to the CPU, and the product type has
   a device implementation; otherwise we defer to the CPU MPIAIJ machinery. */
static PetscErrorCode MatProductSetFromOptions_MPIAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      Biscusp = PETSC_FALSE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* the GPU path only applies when B actually lives on the device */
  if (!product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATMPIAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  /* only AB, AtB and PtAP have a CUSPARSE symbolic implementation */
  if (Biscusp && (product->type == MATPRODUCT_AB || product->type == MATPRODUCT_AtB || product->type == MATPRODUCT_PtAP)) {
    mat->ops->productsymbolic = MatProductSymbolic_MPIAIJCUSPARSE_MPIAIJCUSPARSE;
  }
  /* anything not handled above falls back to the host implementation */
  if (!mat->ops->productsymbolic) {
    ierr = MatProductSetFromOptions_MPIAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
722 
723 /*@
724    MatAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
725 
726    Not collective
727 
728    Input Parameters:
729 +  A - Matrix of type SEQAIJCUSPARSE or MPIAIJCUSPARSE
730 -  gen - the boolean flag
731 
732    Level: intermediate
733 
734 .seealso: MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE
735 @*/
PetscErrorCode  MatAIJCUSPARSESetGenerateTranspose(Mat A, PetscBool gen)
{
  PetscErrorCode ierr;
  PetscBool      ismpiaij;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  MatCheckPreallocated(A,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATMPIAIJ,&ismpiaij);CHKERRQ(ierr);
  if (!ismpiaij) {
    /* sequential matrix: forward the flag directly */
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,gen);CHKERRQ(ierr);
  } else {
    /* parallel matrix: propagate the flag to both sequential blocks */
    Mat Ad,Ao;

    ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,NULL);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(Ad,gen);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(Ao,gen);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
756 
/* yy = A*xx. The ghost-value scatter for the off-diagonal block is started
   before the diagonal multiply so communication overlaps computation. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  /* the local size of xx must match the local column layout of A */
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  /* start gathering ghost values while the diagonal block multiplies */
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* accumulate the off-diagonal contribution: yy += B*lvec */
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
772 
/* Zero all entries of an MPIAIJCUSPARSE matrix.  If the stripped-down
   device-resident mirror (deviceMat) exists, its diagonal and off-diagonal
   value arrays are zeroed directly on the GPU so the device copy stays in
   sync; the host-side blocks are then zeroed through MatZeroEntries. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t  err;
      PetscScalar  *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      /* fetch the device-side pointer to the diagonal values, then zero them in place on the GPU */
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      /* same for the off-diagonal block values */
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  /* zero the host-side sequential blocks (dispatches to their own implementations) */
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
/* zz = yy + A*xx, overlapping the ghost scatter with the diagonal multiply. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  /* the local size of xx must match the local column layout of A */
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  /* start gathering ghost values while the diagonal block works */
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* accumulate the off-diagonal contribution: zz += B*lvec */
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
815 
/* yy = A^T*xx. Both local transpose multiplies run first; the reverse
   scatter then adds the off-diagonal contributions into remote rows. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  /* for the transpose, xx must match the local row layout of A */
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  /* lvec = B^T*xx, yy = A_diag^T*xx */
  ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  /* add the off-process contributions back into yy */
  ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
831 
/* Record the requested cuSPARSE storage format for the diagonal and/or
   off-diagonal blocks; the format is applied when the blocks are (re)built.
   Valid operations here are MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG
   and MAT_CUSPARSE_ALL. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed: the old message listed MAT_CUSPARSE_MULT_DIAG twice and named a nonexistent MAT_CUSPARSE_MULT_ALL */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
854 
/* Process -mat_cusparse_* options controlling the GPU storage format of the
   diagonal and off-diagonal blocks. Formats only apply to unfactored matrices. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  Mat_MPIAIJ               *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  PetscErrorCode           ierr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format for the diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) { ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); }
    /* storage format for the off-diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) { ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); }
    /* storage format for both blocks at once */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) { ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
885 
/* Finish assembly via the host MPIAIJ path, then make the ghost-value vector
   a CUDA vector on first final assembly, and mark the matrix as GPU-resident
   when a device-side mirror exists. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat  = cusp->deviceMat;
  PetscErrorCode             ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FINAL_ASSEMBLY && !A->was_assembled) {
    /* first final assembly: the scatter target for ghost values must live on the GPU */
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; /* values were assembled on the device */
  }
  PetscFunctionReturn(0);
}
903 
/* Destroy an MPIAIJCUSPARSE matrix: free the device-side split-CSR mirror
   (if one was created by MatCUSPARSEGetDeviceMatWrite), tear down the
   per-block cuSPARSE handles, the shared handle/stream and the COO helper
   arrays, then fall through to the generic MPIAIJ destructor. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* pull the struct back to the host so we can free the buffers it points to */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* the row-offset arrays were only cudaMalloc'ed when the host block uses
       compressed rows (see MatCUSPARSEGetDeviceMatWrite); otherwise they alias
       buffers owned by the SeqAIJCUSPARSE blocks and must not be freed here */
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* Thrust containers inside the handles may throw, hence the try block */
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    /* COO permutation and workspace arrays (may be NULL; delete handles that) */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* remove the composed methods installed by MatConvert_MPIAIJ_MPIAIJCUSPARSE */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
950 
/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE: retype the sequential diagonal
   and off-diagonal blocks to SEQAIJCUSPARSE, make CUDA the default vector
   type, allocate the GPU bookkeeping structure, and install the
   CUSPARSE-specific operations and composed methods. For MAT_INPLACE_MATRIX
   the conversion happens directly on B (*newmat aliases B). */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat; /* for MAT_INPLACE_MATRIX this is B itself */

  /* vectors created from this matrix should default to CUDA */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  /* allocate the GPU bookkeeping struct only once; MAT_REUSE_MATRIX keeps the existing one */
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0; /* use the default stream */
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  /* install the CUSPARSE-aware operations */
  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  /* composed methods; MatDestroy_MPIAIJCUSPARSE removes them again */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1007 
/* Type constructor for MATMPIAIJCUSPARSE: verify CUDA is usable, build a
   plain MPIAIJ matrix, then convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1018 
1019 /*@
1020    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1021    (the default parallel PETSc format).  This matrix will ultimately pushed down
1022    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1023    assembly performance the user should preallocate the matrix storage by setting
1024    the parameter nz (or the array nnz).  By setting these parameters accurately,
1025    performance during matrix assembly can be increased by more than a factor of 50.
1026 
1027    Collective
1028 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the DIAGONAL portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the DIAGONAL portion of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the OFF-DIAGONAL portion of the local submatrix (possibly different for each row) or NULL
1036 
1037    Output Parameter:
1038 .  A - the matrix
1039 
1040    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1041    MatXXXXSetPreallocation() paradigm instead of this routine directly.
1042    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1043 
1044    Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
1046 
1047    The AIJ format (also called the Yale sparse matrix format or
1048    compressed row storage), is fully compatible with standard Fortran 77
1049    storage.  That is, the stored row and column indices can begin at
1050    either one (as in Fortran) or zero.  See the users' manual for details.
1051 
   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic
   memory allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.
1056 
1057    By default, this format uses inodes (identical nodes) when possible, to
1058    improve numerical efficiency of matrix-vector products and solves. We
1059    search for consecutive rows with the same nonzero structure, thereby
1060    reusing matrix information to achieve increased efficiency.
1061 
1062    Level: intermediate
1063 
1064 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
1065 @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size == 1) {
    /* uniprocessor communicator: sequential CUSPARSE matrix */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    /* parallel communicator: MPI CUSPARSE matrix with diag/offdiag preallocation */
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1084 
1085 /*MC
1086    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
1087 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1089    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1090    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1091 
1092    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
1093    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
1094    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
1095    for communicators controlling multiple processes.  It is recommended that you call both of
1096    the above preallocation routines for simplicity.
1097 
1098    Options Database Keys:
1099 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
1100 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1101 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1102 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1103 
1104   Level: beginner
1105 
.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
1109 
// Return (and lazily create) the GPU pointer to a stripped-down split-CSR view of
// the matrix, for device-side MatSetValues-style assembly. Works for both Seq and
// MPI matrices. On return the matrix is marked unassembled (ready for writing).
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;   /* raw device CSR row/col arrays for the diag (a*) and offdiag (b*) blocks */
  PetscScalar                *aa,*ba;           /* raw device CSR value arrays */

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      /* sequential: only a diagonal block; no offdiag arrays */
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      /* MPI: grab raw device CSR arrays from both the diag (A) and offdiag (B) blocks */
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    /* factored matrices have no device split-CSR view */
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
  }
  if (!*p_d_mat) {
    cudaError_t                 err;
    PetscSplitCSRDataStructure  *d_mat, h_mat; /* h_mat is the host staging copy pushed to d_mat at the end */
    Mat_SeqAIJ                  *jaca;
    PetscInt                    n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
      // NOTE(review): in this sequential path h_mat.colmap, h_mat.offdiag.n and
      // h_mat.offdiag.ignorezeroentries are never assigned before h_mat is copied
      // to the device below — they hold uninitialized stack values; confirm the
      // device code never reads them when offdiag.i is NULL.
    } else {
      Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ  *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazy) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9; /* sentinel to catch out-of-range writes by the loop below */
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        /* colmap: global column -> (local offdiag column + 1); 0 means "not present" */
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      /* compressed rows: the device needs a full (n+1)-entry row-offset array, so upload the host one */
      if (jacb->compressedrow.use) {
        err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    /* same compressed-row handling for the diagonal block */
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    /* device mirror already exists; just hand it back */
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}
1245