xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision e589036e9683f8580551b8f54d59a5f73edfef1f)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_CXX_COMPLEX_FIX
3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
4 
5 #include <petscconf.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
7 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
8 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
9 #include <thrust/advance.h>
10 #include <petscsf.h>
11 
/* Thrust functor: assigns the first element of a zipped tuple to the second.
   Used below (MatSetValuesCOO_MPIAIJCUSPARSE) with a zip of a permutation
   iterator and the work array to scatter permuted COO values on the GPU. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
21 
/* Set the numerical values of the matrix from the array v, given in the same
   order as the (i,j) pairs previously passed to MatSetPreallocationCOO.
   v may live in host or device memory (detected with isCudaMem).
   If preallocation found diagonal and off-diagonal entries interleaved, the
   permutation cusp->coo_p is applied first (into the work array cusp->coo_pw)
   so the values can be split at coo_nd into the diag (a->A) and offdiag (a->B)
   sequential blocks; otherwise v is split directly. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total number of COO entries */
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (cusp->coo_p && v) { /* values must be permuted before splitting into diag/offdiag */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else { /* host values: stage them in a temporary device array first */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* coo_pw[k] = d_v[coo_p[k]], performed on the device via a zipped permutation iterator */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w; /* temporary staging copy no longer needed (NULL when v was device memory) */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else { /* no permutation needed: v is already [diag entries | offdiag entries] */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* values now live on the GPU; mark the matrix assembled with GPU-resident data */
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
68 
/* Predicate on zipped (row,col,perm) tuples: true when the column index
   (second tuple element) lies inside the local diagonal-block column range
   [_cstart,_cend). Used to partition interleaved COO entries on the GPU. */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return (_cstart <= col) && (col < _cend);
  }
};
81 
/* Predicate on a global column index: true when the column is OUTSIDE the
   local diagonal-block range [_cstart,_cend), i.e. it belongs to the
   off-diagonal part of the parallel matrix. */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return !(_cstart <= c && c < _cend);
  }
};
93 
/* Unary transform: converts a global index to a local one by subtracting
   the start of the local ownership range. */
struct GlobToLoc
{
  PetscInt _rstart; /* first globally owned index on this process */

  GlobToLoc(PetscInt start) : _rstart(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _rstart;
  }
};
105 
/* Preallocate the parallel AIJ matrix from n global COO coordinates
   (coo_i,coo_j). On the GPU, entries are partitioned into diagonal-block
   (column inside [cmap->rstart,cmap->rend)) and off-diagonal parts; when the
   two kinds were interleaved in the input, the permutation used is recorded
   in cusp->coo_p so MatSetValuesCOO can later reorder values identically.
   The off-process column map b->garray is rebuilt from the sorted, unique
   off-diagonal columns, and the sequential blocks b->A/b->B are recreated as
   MATSEQAIJCUSPARSE and preallocated with local COO indices. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0; /* number of distinct off-diagonal columns */
  THRUSTINTARRAY         d_i(n);   /* device copies of the COO row/col indices */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* discard any previous diag/offdiag split: release handles first, then the matrices */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* find the first off-diagonal entry, and the first diagonal entry after it;
     when both exist the input interleaves the two kinds and must be partitioned */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    /* coo_p starts as the identity; after the partition it records where each
       original entry moved, which MatSetValuesCOO uses to permute the values */
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # off-diagonal entries */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* sort + unique the off-diagonal tail of d_j in place to derive garray
     (jj above preserved the unsorted copy needed for the local indices) */
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* translate the saved global off-diagonal columns in jj to local positions in garray */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* recreate the sequential diagonal (b->A) and off-diagonal (b->B) blocks */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  /* propagate storage formats, cusparse handle and stream to the new blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
209 
/* Merge the diagonal and off-diagonal blocks of A into a single sequential
   matrix A_loc (CUSPARSE-aware merge). When glob is requested, also build the
   index set mapping A_loc's columns back to global column numbers: owned
   columns first (cst..cst+dn-1), then the off-process columns from cmap. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst,dn,on,k,*gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (k=0; k<dn; k++) gidx[k]    = cst + k;  /* locally owned columns */
    for (k=0; k<on; k++) gidx[dn+k] = cmap[k];  /* ghost columns */
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
232 
/* Classical (d_nz/d_nnz, o_nz/o_nnz) preallocation for MPIAIJCUSPARSE.
   Creates (on first use) the two sequential blocks b->A and b->B as
   MATSEQAIJCUSPARSE, preallocates them, and propagates the configured
   storage formats, cusparse handle and stream. */
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           r;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* debug builds validate the per-row nonzero counts before anything is allocated */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (r=0; r<B->rmap->n; r++) {
      if (d_nnz[r] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",r,d_nnz[r]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (r=0; r<B->rmap->n; r++) {
      if (o_nnz[r] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",r,o_nnz[r]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  if (b->lvec) { /* the local scatter vector, if present, must live on the GPU too */
    ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
281 
/* Per-product data hung off C->product->data by
   MatProductSymbolic_MPIAIJCUSPARSE_MPIAIJCUSPARSE and released by
   MatDestroy_MatMatMPIAIJCUSPARSE. */
typedef struct {
  Mat         *mp;    /* intermediate products */
  PetscBool   *mptmp; /* is the intermediate product temporary ? */
  PetscInt    cp;     /* number of intermediate products */

  /* support for MatGetBrowsOfAoCols_MPIAIJ for P_oth */
  PetscInt    *startsj_s,*startsj_r; /* communication offsets reused on MAT_REUSE_MATRIX */
  PetscScalar *bufa;                 /* send/receive buffer reused on MAT_REUSE_MATRIX */
  Mat         P_oth;                 /* off-process rows of P needed locally */

  /* may take advantage of merging product->B */
  Mat Bloc; /* merged local (diag+offdiag) part of product->B */

  /* cusparse does not have support to split between symbolic and numeric phases
     When api_user is true, we don't need to update the numerical values
     of the temporary storage */
  PetscBool reusesym;

  /* support for COO values insertion */
  PetscScalar *coo_v,*coo_w; /* coo_v: values passed to MatSetValuesCOO; coo_w: gather staging when sf is used */
  PetscSF     sf; /* if present, non-local values insertion (i.e. AtB or PtAP) */
} MatMatMPIAIJCUSPARSE;
304 
/* Destructor for the MatMatMPIAIJCUSPARSE product data: frees the
   communication scratch, the COO value buffers, the helper matrices,
   the star forest and all intermediate products. */
PetscErrorCode MatDestroy_MatMatMPIAIJCUSPARSE(void *data)
{
  MatMatMPIAIJCUSPARSE *mm = (MatMatMPIAIJCUSPARSE*)data;
  PetscErrorCode       ierr;
  PetscInt             k;

  PetscFunctionBegin;
  ierr = PetscFree2(mm->startsj_s,mm->startsj_r);CHKERRQ(ierr);
  ierr = PetscFree(mm->bufa);CHKERRQ(ierr);
  ierr = PetscFree(mm->coo_v);CHKERRQ(ierr);
  ierr = PetscFree(mm->coo_w);CHKERRQ(ierr);
  ierr = MatDestroy(&mm->P_oth);CHKERRQ(ierr);
  ierr = MatDestroy(&mm->Bloc);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&mm->sf);CHKERRQ(ierr);
  for (k = 0; k < mm->cp; k++) { /* intermediate products */
    ierr = MatDestroy(&mm->mp[k]);CHKERRQ(ierr);
  }
  ierr = PetscFree(mm->mp);CHKERRQ(ierr);
  ierr = PetscFree(mm->mptmp);CHKERRQ(ierr);
  ierr = PetscFree(mm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
327 
/* Numeric phase of the MPIAIJCUSPARSE product: refresh the temporary
   operand matrices (unless the symbolic-phase values can be reused),
   recompute each intermediate product, concatenate their values into the
   COO value array, gather off-process contributions through the SF when
   present, and set everything into C with one MatSetValuesCOO call. */
static PetscErrorCode MatProductNumeric_MPIAIJCUSPARSE_MPIAIJCUSPARSE(Mat C)
{
  MatMatMPIAIJCUSPARSE *mmdata;
  PetscScalar          *tmp;
  PetscInt             i,n;
  PetscErrorCode       ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatMPIAIJCUSPARSE*)C->product->data;
  /* with an SF, values are staged in coo_w and gathered into coo_v below */
  tmp = mmdata->sf ? mmdata->coo_w : mmdata->coo_v;
  if (!mmdata->reusesym) { /* update temporary matrices */
    if (mmdata->P_oth) {
      ierr = MatGetBrowsOfAoCols_MPIAIJ(C->product->A,C->product->B,MAT_REUSE_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);
    }
    if (mmdata->Bloc) {
      ierr = MatMPIAIJGetLocalMatMerge(C->product->B,MAT_REUSE_MATRIX,NULL,&mmdata->Bloc);CHKERRQ(ierr);
    }
  }
  mmdata->reusesym = PETSC_FALSE; /* symbolic-phase values are only reusable once */
  for (i = 0, n = 0; i < mmdata->cp; i++) {
    Mat_SeqAIJ        *mm = (Mat_SeqAIJ*)mmdata->mp[i]->data;
    const PetscScalar *vv;

    if (!mmdata->mp[i]->ops->productnumeric) SETERRQ1(PetscObjectComm((PetscObject)mmdata->mp[i]),PETSC_ERR_PLIB,"Missing numeric op for %s",MatProductTypes[mmdata->mp[i]->product->type]);
    ierr = (*mmdata->mp[i]->ops->productnumeric)(mmdata->mp[i]);CHKERRQ(ierr);
    if (mmdata->mptmp[i]) continue; /* temporary products feed later products, not C */
    /* TODO: add support for using GPU data directly */
    ierr = MatSeqAIJGetArrayRead(mmdata->mp[i],&vv);CHKERRQ(ierr);
    ierr = PetscArraycpy(tmp + n,vv,mm->nz);CHKERRQ(ierr);
    ierr = MatSeqAIJRestoreArrayRead(mmdata->mp[i],&vv);CHKERRQ(ierr);
    n   += mm->nz; /* running offset into the concatenated value array */
  }
  if (mmdata->sf) { /* offprocess insertion */
    ierr = PetscSFGatherBegin(mmdata->sf,MPIU_SCALAR,tmp,mmdata->coo_v);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(mmdata->sf,MPIU_SCALAR,tmp,mmdata->coo_v);CHKERRQ(ierr);
  }
  ierr = MatSetValuesCOO(C,mmdata->coo_v,INSERT_VALUES);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
369 
370 /* Pt * A or A * P */
371 #define MAX_NUMBER_INTERMEDIATE 4
372 static PetscErrorCode MatProductSymbolic_MPIAIJCUSPARSE_MPIAIJCUSPARSE(Mat C)
373 {
374   Mat_Product            *product = C->product;
375   Mat                    A,P,mp[MAX_NUMBER_INTERMEDIATE];
376   Mat_MPIAIJ             *a,*p;
377   MatMatMPIAIJCUSPARSE   *mmdata;
378   ISLocalToGlobalMapping P_oth_l2g = NULL;
379   IS                     glob = NULL;
380   const PetscInt         *globidx,*P_oth_idx;
381   const PetscInt         *cmapa[MAX_NUMBER_INTERMEDIATE],*rmapa[MAX_NUMBER_INTERMEDIATE];
382   PetscInt               cp = 0,m,n,M,N,ncoo,*coo_i,*coo_j,cmapt[MAX_NUMBER_INTERMEDIATE],rmapt[MAX_NUMBER_INTERMEDIATE],i,j;
383   MatProductType         ptype;
384   PetscBool              mptmp[MAX_NUMBER_INTERMEDIATE],hasoffproc = PETSC_FALSE;
385   PetscMPIInt            size;
386   PetscErrorCode         ierr;
387 
388   PetscFunctionBegin;
389   MatCheckProduct(C,1);
390   if (product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
391   ptype = product->type;
392   if (product->A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
393   switch (ptype) {
394   case MATPRODUCT_AB:
395     A = product->A;
396     P = product->B;
397     m = A->rmap->n;
398     n = P->cmap->n;
399     M = A->rmap->N;
400     N = P->cmap->N;
401     break;
402   case MATPRODUCT_AtB:
403     P = product->A;
404     A = product->B;
405     m = P->cmap->n;
406     n = A->cmap->n;
407     M = P->cmap->N;
408     N = A->cmap->N;
409     hasoffproc = PETSC_TRUE;
410     break;
411   case MATPRODUCT_PtAP:
412     A = product->A;
413     P = product->B;
414     m = P->cmap->n;
415     n = P->cmap->n;
416     M = P->cmap->N;
417     N = P->cmap->N;
418     hasoffproc = PETSC_TRUE;
419     break;
420   default:
421     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]);
422   }
423   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)C),&size);CHKERRQ(ierr);
424   if (size == 1) hasoffproc = PETSC_FALSE;
425 
426   for (i=0;i<MAX_NUMBER_INTERMEDIATE;i++) {
427     mp[i]    = NULL;
428     mptmp[i] = PETSC_FALSE;
429     rmapt[i] = 0;
430     cmapt[i] = 0;
431     rmapa[i] = NULL;
432     cmapa[i] = NULL;
433   }
434 
435   a = (Mat_MPIAIJ*)A->data;
436   p = (Mat_MPIAIJ*)P->data;
437   ierr = MatSetSizes(C,m,n,M,N);CHKERRQ(ierr);
438   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
439   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
440   ierr = MatSetType(C,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
441   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
442   mmdata->reusesym = product->api_user;
443   switch (ptype) {
444   case MATPRODUCT_AB: /* A * P */
445     ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);
446 
447     if (1) { /* A_diag * P_loc and A_off * P_oth TODO: add customization for this */
448       /* P is product->B */
449       ierr = MatMPIAIJGetLocalMatMerge(P,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
450       ierr = MatProductCreate(a->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
451       ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
452       ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
453       mp[cp]->product->api_user = product->api_user;
454       ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
455       if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
456       ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
457       ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
458       rmapt[cp] = 1;
459       cmapt[cp] = 2;
460       cmapa[cp] = globidx;
461       mptmp[cp] = PETSC_FALSE;
462       cp++;
463     } else { /* A_diag * P_diag and A_diag * P_off and A_off * P_oth */
464       ierr = MatProductCreate(a->A,p->A,NULL,&mp[cp]);CHKERRQ(ierr);
465       ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
466       ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
467       mp[cp]->product->api_user = product->api_user;
468       ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
469       if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
470       ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
471       rmapt[cp] = 1;
472       cmapt[cp] = 1;
473       mptmp[cp] = PETSC_FALSE;
474       cp++;
475       ierr = MatProductCreate(a->A,p->B,NULL,&mp[cp]);CHKERRQ(ierr);
476       ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
477       ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
478       mp[cp]->product->api_user = product->api_user;
479       ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
480       if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
481       ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
482       rmapt[cp] = 1;
483       cmapt[cp] = 2;
484       cmapa[cp] = p->garray;
485       mptmp[cp] = PETSC_FALSE;
486       cp++;
487     }
488     if (mmdata->P_oth) {
489       ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth,&P_oth_l2g);CHKERRQ(ierr);
490       ierr = ISLocalToGlobalMappingGetIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
491       ierr = MatSetType(mmdata->P_oth,((PetscObject)(a->B))->type_name);CHKERRQ(ierr);
492       ierr = MatProductCreate(a->B,mmdata->P_oth,NULL,&mp[cp]);CHKERRQ(ierr);
493       ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
494       ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
495       mp[cp]->product->api_user = product->api_user;
496       ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
497       if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
498       ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
499       rmapt[cp] = 1;
500       cmapt[cp] = 2;
501       cmapa[cp] = P_oth_idx;
502       mptmp[cp] = PETSC_FALSE;
503       cp++;
504     }
505     break;
506   case MATPRODUCT_AtB: /* (P^t * A): P_diag * A_loc + P_off * A_loc */
507     /* A is product->B */
508     ierr = MatMPIAIJGetLocalMatMerge(A,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
509     ierr = MatProductCreate(p->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
510     ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
511     ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
512     mp[cp]->product->api_user = product->api_user;
513     ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
514     if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
515     ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
516     ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
517     rmapt[cp] = 1;
518     cmapt[cp] = 2;
519     cmapa[cp] = globidx;
520     mptmp[cp] = PETSC_FALSE;
521     cp++;
522     ierr = MatProductCreate(p->B,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
523     ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
524     ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
525     mp[cp]->product->api_user = product->api_user;
526     ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
527     if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
528     ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
529     rmapt[cp] = 2;
530     rmapa[cp] = p->garray;
531     cmapt[cp] = 2;
532     cmapa[cp] = globidx;
533     mptmp[cp] = PETSC_FALSE;
534     cp++;
535     break;
536   case MATPRODUCT_PtAP:
537     ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);
538     /* P is product->B */
539     ierr = MatMPIAIJGetLocalMatMerge(P,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
540     ierr = MatProductCreate(a->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
541     ierr = MatProductSetType(mp[cp],MATPRODUCT_PtAP);CHKERRQ(ierr);
542     ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
543     mp[cp]->product->api_user = product->api_user;
544     ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
545     if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
546     ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
547     ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
548     rmapt[cp] = 2;
549     rmapa[cp] = globidx;
550     cmapt[cp] = 2;
551     cmapa[cp] = globidx;
552     mptmp[cp] = PETSC_FALSE;
553     cp++;
554     if (mmdata->P_oth) {
555       ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth,&P_oth_l2g);CHKERRQ(ierr);
556       ierr = MatSetType(mmdata->P_oth,((PetscObject)(a->B))->type_name);CHKERRQ(ierr);
557       ierr = ISLocalToGlobalMappingGetIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
558       ierr = MatProductCreate(a->B,mmdata->P_oth,NULL,&mp[cp]);CHKERRQ(ierr);
559       ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
560       ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
561       mp[cp]->product->api_user = product->api_user;
562       ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
563       if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
564       ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
565       mptmp[cp] = PETSC_TRUE;
566       cp++;
567       ierr = MatProductCreate(mmdata->Bloc,mp[1],NULL,&mp[cp]);CHKERRQ(ierr);
568       ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
569       ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
570       mp[cp]->product->api_user = product->api_user;
571       ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
572       if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
573       ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
574       rmapt[cp] = 2;
575       rmapa[cp] = globidx;
576       cmapt[cp] = 2;
577       cmapa[cp] = P_oth_idx;
578       mptmp[cp] = PETSC_FALSE;
579       cp++;
580     }
581     break;
582   default:
583     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]);
584   }
585   ierr = PetscMalloc1(cp,&mmdata->mp);CHKERRQ(ierr);
586   for (i = 0; i < cp; i++) mmdata->mp[i] = mp[i];
587   ierr = PetscMalloc1(cp,&mmdata->mptmp);CHKERRQ(ierr);
588   for (i = 0; i < cp; i++) mmdata->mptmp[i] = mptmp[i];
589   mmdata->cp = cp;
590   C->product->data       = mmdata;
591   C->product->destroy    = MatDestroy_MatMatMPIAIJCUSPARSE;
592   C->ops->productnumeric = MatProductNumeric_MPIAIJCUSPARSE_MPIAIJCUSPARSE;
593 
594   /* prepare coo coordinates for values insertion */
595   ncoo = 0;
596   for (cp = 0; cp < mmdata->cp; cp++) {
597     Mat_SeqAIJ *mm = (Mat_SeqAIJ*)mp[cp]->data;
598     if (mptmp[cp]) continue;
599     ncoo += mm->nz;
600   }
601   ierr = PetscMalloc2(ncoo,&coo_i,ncoo,&coo_j);CHKERRQ(ierr);
602   ncoo = 0;
603   for (cp = 0; cp < mmdata->cp; cp++) {
604     Mat_SeqAIJ     *mm = (Mat_SeqAIJ*)mp[cp]->data;
605     PetscInt       *coi = coo_i + ncoo;
606     PetscInt       *coj = coo_j + ncoo;
607     const PetscInt mr = mp[cp]->rmap->n;
608     const PetscInt *jj  = mm->j;
609     const PetscInt *ii  = mm->i;
610 
611     if (mptmp[cp]) continue;
612     /* rows coo */
613     if (!rmapt[cp]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"This should not happen");
614     else if (rmapt[cp] == 1) { /* local to global for owned rows of  */
615       const PetscInt rs = C->rmap->rstart;
616       for (i = 0; i < mr; i++) {
617         const PetscInt gr = i + rs;
618         for (j = ii[i]; j < ii[i+1]; j++) {
619           coi[j] = gr;
620         }
621       }
622     } else { /* offprocess */
623       const PetscInt *rmap = rmapa[cp];
624       for (i = 0; i < mr; i++) {
625         const PetscInt gr = rmap[i];
626         for (j = ii[i]; j < ii[i+1]; j++) {
627           coi[j] = gr;
628         }
629       }
630     }
631     /* columns coo */
632     if (!cmapt[cp]) {
633       ierr = PetscArraycpy(coj,jj,mm->nz);CHKERRQ(ierr);
634     } else if (cmapt[cp] == 1) { /* local to global for owned columns of P */
635       const PetscInt cs = P->cmap->rstart;
636       for (j = 0; j < mm->nz; j++) {
637         coj[j] = jj[j] + cs;
638       }
639     } else { /* offdiag */
640       const PetscInt *cmap = cmapa[cp];
641       for (j = 0; j < mm->nz; j++) {
642         coj[j] = cmap[jj[j]];
643       }
644     }
645     ncoo += mm->nz;
646   }
647   if (glob) {
648     ierr = ISRestoreIndices(glob,&globidx);CHKERRQ(ierr);
649   }
650   ierr = ISDestroy(&glob);CHKERRQ(ierr);
651   if (P_oth_l2g) {
652     ierr = ISLocalToGlobalMappingRestoreIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
653   }
654   ierr = ISLocalToGlobalMappingDestroy(&P_oth_l2g);CHKERRQ(ierr);
655 
656   if (hasoffproc) { /* offproc values insertion */
657     const PetscInt *sfdeg;
658     const PetscInt n = P->cmap->n;
659     PetscInt ncoo2,*coo_i2,*coo_j2;
660 
661     ierr = PetscSFCreate(PetscObjectComm((PetscObject)C),&mmdata->sf);CHKERRQ(ierr);
662     ierr = PetscSFSetGraphLayout(mmdata->sf,P->cmap,ncoo,NULL,PETSC_OWN_POINTER,coo_i);CHKERRQ(ierr);
663     ierr = PetscSFComputeDegreeBegin(mmdata->sf,&sfdeg);CHKERRQ(ierr);
664     ierr = PetscSFComputeDegreeEnd(mmdata->sf,&sfdeg);CHKERRQ(ierr);
665     for (i = 0, ncoo2 = 0; i < n; i++) ncoo2 += sfdeg[i];
666     ierr = PetscMalloc2(ncoo2,&coo_i2,ncoo2,&coo_j2);CHKERRQ(ierr);
667     ierr = PetscSFGatherBegin(mmdata->sf,MPIU_INT,coo_i,coo_i2);CHKERRQ(ierr);
668     ierr = PetscSFGatherEnd(mmdata->sf,MPIU_INT,coo_i,coo_i2);CHKERRQ(ierr);
669     ierr = PetscSFGatherBegin(mmdata->sf,MPIU_INT,coo_j,coo_j2);CHKERRQ(ierr);
670     ierr = PetscSFGatherEnd(mmdata->sf,MPIU_INT,coo_j,coo_j2);CHKERRQ(ierr);
671     ierr = PetscFree2(coo_i,coo_j);CHKERRQ(ierr);
672     ierr = PetscMalloc1(ncoo,&mmdata->coo_w);CHKERRQ(ierr);
673     coo_i = coo_i2;
674     coo_j = coo_j2;
675     ncoo  = ncoo2;
676   }
677 
678   /* preallocate with COO data */
679   ierr = MatSetPreallocationCOO(C,ncoo,coo_i,coo_j);CHKERRQ(ierr);
680   ierr = PetscMalloc1(ncoo,&mmdata->coo_v);CHKERRQ(ierr);
681   ierr = PetscFree2(coo_i,coo_j);CHKERRQ(ierr);
682   PetscFunctionReturn(0);
683 }
684 
/* Select the symbolic product implementation for an MPIAIJCUSPARSE matrix.
   The GPU path is only taken for AB, AtB and PtAP products when B is of type
   MPIAIJCUSPARSE and has not been bound to the CPU; everything else falls
   back to the host MPIAIJ implementation. */
static PetscErrorCode MatProductSetFromOptions_MPIAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      iscusparse = PETSC_FALSE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  if (!product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATMPIAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  }
  if (iscusparse && (product->type == MATPRODUCT_AB || product->type == MATPRODUCT_AtB || product->type == MATPRODUCT_PtAP)) {
    mat->ops->productsymbolic = MatProductSymbolic_MPIAIJCUSPARSE_MPIAIJCUSPARSE;
  }
  /* unsupported product types (or CPU-bound B) use the generic CPU setup */
  if (!mat->ops->productsymbolic) {
    ierr = MatProductSetFromOptions_MPIAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
712 
713 /*@
714    MatAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
715 
716    Not collective
717 
718    Input Parameters:
719 +  A - Matrix of type SEQAIJCUSPARSE or MPIAIJCUSPARSE
720 -  gen - the boolean flag
721 
722    Level: intermediate
723 
724 .seealso: MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE
725 @*/
PetscErrorCode  MatAIJCUSPARSESetGenerateTranspose(Mat A, PetscBool gen)
{
  PetscErrorCode ierr;
  PetscBool      ismpiaij;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  MatCheckPreallocated(A,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATMPIAIJ,&ismpiaij);CHKERRQ(ierr);
  if (!ismpiaij) {
    /* sequential matrix: set the flag directly */
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,gen);CHKERRQ(ierr);
  } else {
    /* parallel matrix: propagate the flag to both sequential blocks */
    Mat Ad,Ao;

    ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,NULL);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(Ad,gen);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(Ao,gen);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
746 
/* y = A*x for an MPIAIJ matrix with CUSPARSE blocks. The ghost-value scatter
   is overlapped with the diagonal-block product; the off-diagonal block then
   adds its contribution computed from the ghosted local vector. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);          /* diagonal block while scatter is in flight */
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr); /* off-diagonal contribution */
  PetscFunctionReturn(0);
}
762 
/* Zero all entries of an MPIAIJCUSPARSE matrix.

   If an unfactored matrix has a device split-CSR mirror, the diagonal and
   off-diagonal value arrays are zeroed directly on the GPU (the value-array
   pointers are first fetched from the device-resident struct); the host-side
   sequential blocks are then zeroed through the usual MatZeroEntries() path. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t  err;
      PetscScalar  *vals;
      /* fixed typo in the info message: "offfdiag" -> "offdiag" */
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      /* copy the device-side value-array POINTER to the host, then memset the array it points to */
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
/* z = y + A*x for an MPIAIJ matrix with CUSPARSE blocks; same overlap pattern
   as MatMult: diagonal-block multadd runs while the ghost scatter completes. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);    /* zz = yy + Ad*xx */
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr); /* zz += Ao*lvec */
  PetscFunctionReturn(0);
}
805 
/* y = A^T*x for an MPIAIJ matrix with CUSPARSE blocks: both transposed block
   products are computed locally, then the off-diagonal partials in lvec are
   accumulated into y with a reverse ADD scatter. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,xlen);
  ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr); /* lvec = Ao^T*xx */
  ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);           /* yy = Ad^T*xx */
  ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
821 
/* Dispatch target of MatCUSPARSESetFormat() for MPIAIJCUSPARSE matrices.

   Records the requested cuSPARSE storage format for the diagonal block,
   the off-diagonal block, or both, depending on op. The format takes effect
   the next time the GPU matrix structures are (re)built. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed error message: it previously listed MAT_CUSPARSE_MULT_DIAG twice and a
       nonexistent MAT_CUSPARSE_MULT_ALL instead of MULT_OFFDIAG and MAT_CUSPARSE_ALL */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
844 
/* Process the runtime options that select the cuSPARSE storage formats used
   for the diagonal and off-diagonal blocks (only meaningful for unfactored
   matrices). */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  Mat_MPIAIJ               *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;
  PetscErrorCode           ierr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) { ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,fmt);CHKERRQ(ierr); }
    /* off-diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->offdiagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) { ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,fmt);CHKERRQ(ierr); }
    /* both blocks at once */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) { ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr); }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
875 
/* Finish assembly via the generic MPIAIJ path, then make sure the ghost
   vector lives on the GPU (first final assembly only) and mark the matrix as
   GPU-resident when a device split-CSR mirror exists. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat  = cusp->deviceMat;
  PetscErrorCode             ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FINAL_ASSEMBLY && !A->was_assembled) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (d_mat) A->offloadmask = PETSC_OFFLOAD_GPU; /* values were assembled on the device */
  PetscFunctionReturn(0);
}
893 
/* Destroy an MPIAIJCUSPARSE matrix.

   Tear-down order matters: first the device split-CSR mirror (if any) is
   freed, then the per-block cuSPARSE handles and the Mat_MPIAIJCUSPARSE
   container, then the composed function pointers are removed, and finally
   the generic MPIAIJ destructor runs. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* pull the device struct back to the host so we can free the arrays it points to */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* the row-offset arrays were only cudaMalloc'd here when compressed-row
       storage forced a separate copy (see MatCUSPARSEGetDeviceMatWrite) */
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* C++ object teardown may throw; convert any exception into a PETSc error */
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct->coo_p;   /* COO permutation (may be NULL) */
    delete cusparseStruct->coo_pw;  /* COO permuted-values workspace (may be NULL) */
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* remove the type-specific composed methods before handing off to the base destructor */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
940 
/* Convert a MATMPIAIJ matrix to MATMPIAIJCUSPARSE.

   Depending on reuse this duplicates B, copies into *newmat, or converts in
   place. The sequential blocks and the ghost vector are retyped to their
   CUDA variants, a Mat_MPIAIJCUSPARSE container (with a cuSPARSE handle) is
   attached on first conversion, and the GPU method table plus the composed
   type-specific methods are installed. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat; /* for MAT_INPLACE_MATRIX, *newmat is B itself */

  /* vectors created from this matrix default to CUDA */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  /* retype the diagonal/off-diagonal blocks and the ghost vector */
  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  /* first-time conversion: allocate and initialize the CUSPARSE container */
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
997 
/* Registered constructor for MATMPIAIJCUSPARSE: builds a plain MPIAIJ matrix
   and converts it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* fail early if no usable CUDA device */
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1008 
1009 /*@
1010    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
1012    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1013    assembly performance the user should preallocate the matrix storage by setting
1014    the parameter nz (or the array nnz).  By setting these parameters accurately,
1015    performance during matrix assembly can be increased by more than a factor of 50.
1016 
1017    Collective
1018 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated)
.  n - number of local columns (or PETSC_DECIDE to have it calculated)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the
           off-diagonal block (possibly different for each row) or NULL
1026 
1027    Output Parameter:
1028 .  A - the matrix
1029 
1030    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1031    MatXXXXSetPreallocation() paradigm instead of this routine directly.
1032    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1033 
1034    Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
1036 
1037    The AIJ format (also called the Yale sparse matrix format or
1038    compressed row storage), is fully compatible with standard Fortran 77
1039    storage.  That is, the stored row and column indices can begin at
1040    either one (as in Fortran) or zero.  See the users' manual for details.
1041 
1042    Specify the preallocated storage with either nz or nnz (not both).
1043    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1044    allocation.  For large problems you MUST preallocate memory or you
1045    will get TERRIBLE performance, see the users' manual chapter on matrices.
1046 
1047    By default, this format uses inodes (identical nodes) when possible, to
1048    improve numerical efficiency of matrix-vector products and solves. We
1049    search for consecutive rows with the same nonzero structure, thereby
1050    reusing matrix information to achieve increased efficiency.
1051 
1052    Level: intermediate
1053 
1054 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
1055 @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    nproc;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&nproc);CHKERRMPI(ierr);
  /* a single-process communicator yields the sequential type, otherwise the MPI type */
  if (nproc == 1) {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1074 
1075 /*MC
1076    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
1077 
1078    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1079    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1080    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1081 
1082    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
1083    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
1084    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
1085    for communicators controlling multiple processes.  It is recommended that you call both of
1086    the above preallocation routines for simplicity.
1087 
1088    Options Database Keys:
1089 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
1090 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1091 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1092 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1093 
1094   Level: beginner
1095 
1096  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
1099 
/* Get a device pointer to a stripped-down (split-CSR) view of the matrix so
   that values can be set from device kernels. Works for both sequential and
   MPI AIJCUSPARSE matrices.

   On first call the PetscSplitCSRDataStructure is allocated on the GPU and
   populated with pointers into the existing cuSPARSE CSR arrays (row offsets,
   column indices, values) of the diagonal and, in parallel, off-diagonal
   blocks; row-offset arrays get a separate device copy when compressed-row
   storage is in use. Subsequent calls return the cached pointer.

   Side effects: marks the matrix as unassembled (ready for device-side
   MatSetValues) and, in parallel, builds aij->colmap and disables stashing /
   off-process entries. Not supported with PETSC_USE_CTABLE or for factored
   matrices (the latter returns *B = NULL). */
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;  /* device CSR index arrays for diag (a*) and offdiag (b*) */
  PetscScalar                *aa,*ba;          /* device CSR value arrays */

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      /* sequential: only a diagonal block; no offdiag arrays */
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      /* parallel: grab raw device pointers for both the diag (A) and offdiag (B) blocks */
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    /* factored matrices are not supported: return NULL rather than erroring */
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
  }
  if (!*p_d_mat) {
    /* first call: build the device struct and upload the pointer table */
    cudaError_t                 err;
    PetscSplitCSRDataStructure  *d_mat, h_mat;  /* device copy and its host staging image */
    Mat_SeqAIJ                  *jaca;
    PetscInt                    n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ  *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazy) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      /* sentinel in the extra slot guards against an out-of-range garray entry below */
      aij->colmap[A->cmap->N] = -9;
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      /* NOTE(review): this nnz is overwritten by the diag-block nnz below and
         never reported for the offdiag block — apparently a dead store */
      nnz = jacb->i[n];

      /* compressed-row offdiag: upload a full-length copy of the row offsets */
      if (jacb->compressedrow.use) {
        err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    /* already built: just return the cached device pointer */
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}
1235