xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 80fdaca06e24329b048de2e274d60da818b72e9c)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_CXX_COMPLEX_FIX
3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
4 
5 #include <petscconf.h>
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
7 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
8 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
9 #include <thrust/advance.h>
10 #include <petscsf.h>
11 
/* Thrust functor: given a zipped pair, copy the first element onto the second.
   Used to gather permuted COO values into a work array on the device. */
struct VecCUDAEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T zipped)
  {
    /* destination (slot 1) <- source (slot 0) */
    thrust::get<1>(zipped) = thrust::get<0>(zipped);
  }
};
21 
/* Insert numerical values into an MPIAIJCUSPARSE matrix, with v ordered as the (i,j)
   pairs previously passed to MatSetPreallocationCOO_MPIAIJCUSPARSE().  v may point to
   host or device memory (distinguished with isCudaMem()) or be NULL.  If a permutation
   cusp->coo_p was built at preallocation time (diagonal/off-diagonal entries were
   interleaved), the values are first permuted on the GPU into cusp->coo_pw before being
   forwarded to the sequential diagonal (a->A) and off-diagonal (a->B) blocks. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total COO entries: diagonal + off-diagonal */
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (cusp->coo_p && v) { /* values must be permuted to split the diag/offdiag parts */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL; /* temporary device copy, used only when v is host memory */

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* gather on the device: coo_pw[k] = d_v[coo_p[k]] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    /* first coo_nd permuted values feed the diagonal block, the remainder the off-diagonal block */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else { /* no permutation needed: v is already split as [diag | offdiag] */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* mark the matrix assembled with up-to-date data on the GPU */
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
68 
/* Predicate used with thrust::partition on zipped (i,j,perm) tuples: returns true when
   the column index (tuple slot 1) lies inside the local diagonal block [cstart,cend). */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return _cstart <= col && col < _cend;
  }
};
81 
/* Predicate on a single column index: returns true when the column lies OUTSIDE the
   local diagonal block [cstart,cend), i.e. the entry belongs to the off-diagonal part. */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return !(_cstart <= c && c < _cend);
  }
};
93 
/* Unary transform: shift a global index down by the local ownership start, turning a
   global row/column number into a local one. */
struct GlobToLoc
{
  PetscInt _shift;

  GlobToLoc(PetscInt start) : _shift(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _shift;
  }
};
105 
/* Preallocate an MPIAIJCUSPARSE matrix from n COO (coo_i, coo_j) pairs given with GLOBAL
   indices.  Entries are split on the GPU into the diagonal block (columns inside
   [B->cmap->rstart, B->cmap->rend)) and the off-diagonal block; if the two kinds are
   interleaved, a permutation cusp->coo_p (and value work array cusp->coo_pw) is built so
   that MatSetValuesCOO can later reorder user values accordingly.  Off-diagonal column
   indices are compacted into b->garray and remapped to local numbering, and the two
   sequential SEQAIJCUSPARSE blocks b->A / b->B are created and COO-preallocated. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;    /* host copy of off-diagonal column indices */
  size_t                 noff = 0; /* number of distinct off-diagonal (ghost) columns */
  THRUSTINTARRAY         d_i(n); /* device copies of the COO row/column indices */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* destroy any previous internal structure before rebuilding it from scratch */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* first entry with an off-diagonal column, and first diagonal entry after it */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* diag and offdiag entries are interleaved: stable-partition them (diag first) and
       record the permutation so values can be reordered the same way later */
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # off-diagonal entries */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* sort+unique the off-diagonal column indices on the device to get the ghost columns */
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* translate the saved off-diagonal global columns (jj) to local ghost numbering via a hash map */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* (re)create the sequential diagonal and off-diagonal blocks */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  /* structure only: values still need to be supplied via MatSetValuesCOO */
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
209 
/* Build the merged local matrix [Ad | Ao] of an MPIAIJCUSPARSE matrix (via
   MatSeqAIJCUSPARSEMergeMats) and, when requested, the IS mapping the merged local
   columns to global column numbers: owned columns first (offset by the column
   ownership start), followed by the ghost columns from cmap. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst,k,nd,no,*gidx;

    ierr = MatGetLocalSize(Ad,NULL,&nd);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&no);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(nd+no,&gidx);CHKERRQ(ierr);
    /* owned columns map to [cst, cst+nd); ghost columns use the compressed map */
    for (k=0; k<nd; k++) gidx[k]    = cst + k;
    for (k=0; k<no; k++) gidx[k+nd] = cmap[k];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),nd+no,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
232 
/* Classical (non-COO) preallocation for MPIAIJCUSPARSE: validates the per-row nonzero
   counts in debug builds, lazily creates the sequential diagonal/off-diagonal blocks,
   forces their type to SEQAIJCUSPARSE, preallocates them, and propagates the cuSPARSE
   format/handle/stream configuration from the parallel matrix. */
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *mpicusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           r;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG)) {
    /* sanity-check user-provided per-row counts (debug builds only) */
    if (d_nnz) {
      for (r=0; r<B->rmap->n; r++) {
        if (d_nnz[r] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",r,d_nnz[r]);
      }
    }
    if (o_nnz) {
      for (r=0; r<B->rmap->n; r++) {
        if (o_nnz[r] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",r,o_nnz[r]);
      }
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  if (b->lvec) {
    ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* mirror the parallel matrix's cuSPARSE configuration onto both blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,mpicusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,mpicusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,mpicusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,mpicusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,mpicusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,mpicusp->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
281 
/* Per-product context stored in C->product->data for MPIAIJCUSPARSE matrix-matrix
   products; destroyed by MatDestroy_MatMatMPIAIJCUSPARSE(). */
typedef struct {
  Mat         *mp;    /* intermediate products */
  PetscBool   *mptmp; /* is the intermediate product temporary ? */
  PetscInt    cp;     /* number of intermediate products */

  /* support for MatGetBrowsOfAoCols_MPIAIJ for P_oth */
  PetscInt    *startsj_s,*startsj_r;
  PetscScalar *bufa;
  Mat         P_oth;  /* rows of P corresponding to off-process columns of A */

  /* may take advantage of merging product->B */
  Mat Bloc;           /* merged local [diag | offdiag] part of product->B */

  /* cusparse does not have support to split between symbolic and numeric phases
     When api_user is true, we don't need to update the numerical values
     of the temporary storage */
  PetscBool reusesym;

  /* support for COO values insertion */
  PetscScalar *coo_v,*coo_w; /* coo_v: final values fed to MatSetValuesCOO; coo_w: pre-gather staging when sf is present */
  PetscSF     sf; /* if present, non-local values insertion (i.e. AtB or PtAP) */
} MatMatMPIAIJCUSPARSE;
304 
/* Release everything owned by a MatMatMPIAIJCUSPARSE product context: scratch arrays,
   the auxiliary matrices (P_oth, Bloc), the star forest, every intermediate product,
   and finally the context itself. */
PetscErrorCode MatDestroy_MatMatMPIAIJCUSPARSE(void *data)
{
  MatMatMPIAIJCUSPARSE *mmdata = (MatMatMPIAIJCUSPARSE*)data;
  PetscErrorCode       ierr;
  PetscInt             p;

  PetscFunctionBegin;
  ierr = PetscFree2(mmdata->startsj_s,mmdata->startsj_r);CHKERRQ(ierr);
  ierr = PetscFree(mmdata->bufa);CHKERRQ(ierr);
  ierr = PetscFree(mmdata->coo_v);CHKERRQ(ierr);
  ierr = PetscFree(mmdata->coo_w);CHKERRQ(ierr);
  ierr = MatDestroy(&mmdata->P_oth);CHKERRQ(ierr);
  ierr = MatDestroy(&mmdata->Bloc);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&mmdata->sf);CHKERRQ(ierr);
  for (p = 0; p < mmdata->cp; p++) { ierr = MatDestroy(&mmdata->mp[p]);CHKERRQ(ierr); }
  ierr = PetscFree(mmdata->mp);CHKERRQ(ierr);
  ierr = PetscFree(mmdata->mptmp);CHKERRQ(ierr);
  ierr = PetscFree(mmdata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
327 
/* Numeric phase of the MPIAIJCUSPARSE matrix products set up by
   MatProductSymbolic_MPIAIJCUSPARSE_MPIAIJCUSPARSE(): refresh the temporary operands
   (unless the symbolic phase already computed them and reusesym is set), run the numeric
   phase of each intermediate sequential product, concatenate their values in COO order,
   optionally gather off-process contributions through the star forest, and insert the
   result with MatSetValuesCOO(). */
static PetscErrorCode MatProductNumeric_MPIAIJCUSPARSE_MPIAIJCUSPARSE(Mat C)
{
  MatMatMPIAIJCUSPARSE *mmdata;
  PetscScalar          *tmp;  /* staging buffer: coo_w when a gather is needed, else coo_v directly */
  PetscInt             i,n;
  PetscErrorCode       ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatMPIAIJCUSPARSE*)C->product->data;
  tmp = mmdata->sf ? mmdata->coo_w : mmdata->coo_v;
  if (!mmdata->reusesym) { /* update temporary matrices */
    if (mmdata->P_oth) {
      ierr = MatGetBrowsOfAoCols_MPIAIJ(C->product->A,C->product->B,MAT_REUSE_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);
    }
    if (mmdata->Bloc) {
      ierr = MatMPIAIJGetLocalMatMerge(C->product->B,MAT_REUSE_MATRIX,NULL,&mmdata->Bloc);CHKERRQ(ierr);
    }
  }
  mmdata->reusesym = PETSC_FALSE; /* the symbolic-phase values are only valid for the first numeric call */
  for (i = 0, n = 0; i < mmdata->cp; i++) {
    Mat_SeqAIJ        *mm = (Mat_SeqAIJ*)mmdata->mp[i]->data;
    const PetscScalar *vv;

    if (!mmdata->mp[i]->ops->productnumeric) SETERRQ1(PetscObjectComm((PetscObject)mmdata->mp[i]),PETSC_ERR_PLIB,"Missing numeric op for %s",MatProductTypes[mmdata->mp[i]->product->type]);
    ierr = (*mmdata->mp[i]->ops->productnumeric)(mmdata->mp[i]);CHKERRQ(ierr);
    if (mmdata->mptmp[i]) continue; /* temporary products feed later products, not C */
    /* TODO: add support for using GPU data directly */
    ierr = MatSeqAIJGetArrayRead(mmdata->mp[i],&vv);CHKERRQ(ierr);
    ierr = PetscArraycpy(tmp + n,vv,mm->nz);CHKERRQ(ierr);
    ierr = MatSeqAIJRestoreArrayRead(mmdata->mp[i],&vv);CHKERRQ(ierr);
    n   += mm->nz;
  }
  if (mmdata->sf) { /* offprocess insertion */
    ierr = PetscSFGatherBegin(mmdata->sf,MPIU_SCALAR,tmp,mmdata->coo_v);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(mmdata->sf,MPIU_SCALAR,tmp,mmdata->coo_v);CHKERRQ(ierr);
  }
  ierr = MatSetValuesCOO(C,mmdata->coo_v,INSERT_VALUES);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
369 
370 /* Pt * A or A * P */
371 #define MAX_NUMBER_INTERMEDIATE 4
/* Symbolic phase for C = A*P, P^t*A (AtB) or P^t*A*P (PtAP) with MPIAIJCUSPARSE
   operands.  The parallel product is decomposed into up to MAX_NUMBER_INTERMEDIATE
   sequential products mp[] on the local blocks (plus P_oth / merged Bloc operands).
   For each non-temporary intermediate product, rmapt/cmapt record how its local
   row/column indices translate to global indices of C:
     1 -> shift by the local ownership start,
     2 -> indirect through the map array rmapa/cmapa,
     0 -> already usable as is (columns only).
   The translated indices are concatenated into one global COO pattern (coo_i,coo_j);
   when some rows belong to other processes (hasoffproc), a PetscSF gather is set up to
   ship those entries to their owners.  Finally C is COO-preallocated and the context is
   stored for MatProductNumeric_MPIAIJCUSPARSE_MPIAIJCUSPARSE(). */
static PetscErrorCode MatProductSymbolic_MPIAIJCUSPARSE_MPIAIJCUSPARSE(Mat C)
{
  Mat_Product            *product = C->product;
  Mat                    A,P,mp[MAX_NUMBER_INTERMEDIATE]; /* intermediate sequential products */
  Mat_MPIAIJ             *a,*p;
  MatMatMPIAIJCUSPARSE   *mmdata;
  ISLocalToGlobalMapping P_oth_l2g = NULL;
  IS                     glob = NULL;
  const PetscInt         *globidx,*P_oth_idx;
  const PetscInt         *cmapa[MAX_NUMBER_INTERMEDIATE],*rmapa[MAX_NUMBER_INTERMEDIATE]; /* col/row local-to-global maps */
  PetscInt               cp = 0,m,n,M,N,ncoo,*coo_i,*coo_j,cmapt[MAX_NUMBER_INTERMEDIATE],rmapt[MAX_NUMBER_INTERMEDIATE],i,j;
  MatProductType         ptype;
  PetscBool              mptmp[MAX_NUMBER_INTERMEDIATE],hasoffproc = PETSC_FALSE;
  PetscMPIInt            size;
  PetscErrorCode         ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  ptype = product->type;
  /* with A symmetric, A^t*B is computed as A*B */
  if (product->A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    A = product->A;
    P = product->B;
    m = A->rmap->n;
    n = P->cmap->n;
    M = A->rmap->N;
    N = P->cmap->N;
    break;
  case MATPRODUCT_AtB:
    P = product->A;
    A = product->B;
    m = P->cmap->n;
    n = A->cmap->n;
    M = P->cmap->N;
    N = A->cmap->N;
    hasoffproc = PETSC_TRUE; /* rows of C are owned by other processes */
    break;
  case MATPRODUCT_PtAP:
    A = product->A;
    P = product->B;
    m = P->cmap->n;
    n = P->cmap->n;
    M = P->cmap->N;
    N = P->cmap->N;
    hasoffproc = PETSC_TRUE; /* rows of C are owned by other processes */
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)C),&size);CHKERRQ(ierr);
  if (size == 1) hasoffproc = PETSC_FALSE; /* no communication needed in the sequential case */

  /* defaults */
  for (i=0;i<MAX_NUMBER_INTERMEDIATE;i++) {
    mp[i]    = NULL;
    mptmp[i] = PETSC_FALSE;
    rmapt[i] = 0;
    cmapt[i] = 0;
    rmapa[i] = NULL;
    cmapa[i] = NULL;
  }

  a = (Mat_MPIAIJ*)A->data;
  p = (Mat_MPIAIJ*)P->data;
  ierr = MatSetSizes(C,m,n,M,N);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  ierr = MatSetType(C,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->reusesym = product->api_user;
  switch (ptype) {
  case MATPRODUCT_AB: /* A * P */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);

    if (1) { /* A_diag * P_loc and A_off * P_oth TODO: add customization for this */
      /* P is product->B */
      ierr = MatMPIAIJGetLocalMatMerge(P,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
      ierr = MatProductCreate(a->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
      rmapt[cp] = 1;       /* rows: owned, shift by rstart */
      cmapt[cp] = 2;       /* cols: map through the merged-local-columns index set */
      cmapa[cp] = globidx;
      mptmp[cp] = PETSC_FALSE;
      cp++;
    } else { /* A_diag * P_diag and A_diag * P_off and A_off * P_oth */
      ierr = MatProductCreate(a->A,p->A,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      rmapt[cp] = 1;
      cmapt[cp] = 1;
      mptmp[cp] = PETSC_FALSE;
      cp++;
      ierr = MatProductCreate(a->A,p->B,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      rmapt[cp] = 1;
      cmapt[cp] = 2;
      cmapa[cp] = p->garray; /* cols: ghost columns of P */
      mptmp[cp] = PETSC_FALSE;
      cp++;
    }
    if (mmdata->P_oth) {
      /* compact P_oth's columns and keep the map back to global columns */
      ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth,&P_oth_l2g);CHKERRQ(ierr);
      ierr = ISLocalToGlobalMappingGetIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
      ierr = MatSetType(mmdata->P_oth,((PetscObject)(a->B))->type_name);CHKERRQ(ierr);
      ierr = MatProductCreate(a->B,mmdata->P_oth,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      rmapt[cp] = 1;
      cmapt[cp] = 2;
      cmapa[cp] = P_oth_idx;
      mptmp[cp] = PETSC_FALSE;
      cp++;
    }
    break;
  case MATPRODUCT_AtB: /* (P^t * A): P_diag * A_loc + P_off * A_loc */
    /* A is product->B */
    ierr = MatMPIAIJGetLocalMatMerge(A,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
    ierr = MatProductCreate(p->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
    ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
    ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
    mp[cp]->product->api_user = product->api_user;
    ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
    if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
    ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
    ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
    rmapt[cp] = 1;
    cmapt[cp] = 2;
    cmapa[cp] = globidx;
    mptmp[cp] = PETSC_FALSE;
    cp++;
    ierr = MatProductCreate(p->B,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
    ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
    ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
    mp[cp]->product->api_user = product->api_user;
    ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
    if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
    ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
    rmapt[cp] = 2;         /* rows of this product live on other processes (P's ghost columns) */
    rmapa[cp] = p->garray;
    cmapt[cp] = 2;
    cmapa[cp] = globidx;
    mptmp[cp] = PETSC_FALSE;
    cp++;
    break;
  case MATPRODUCT_PtAP:
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);
    /* P is product->B */
    ierr = MatMPIAIJGetLocalMatMerge(P,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
    ierr = MatProductCreate(a->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
    ierr = MatProductSetType(mp[cp],MATPRODUCT_PtAP);CHKERRQ(ierr);
    ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
    mp[cp]->product->api_user = product->api_user;
    ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
    if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
    ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
    ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
    rmapt[cp] = 2;
    rmapa[cp] = globidx;
    cmapt[cp] = 2;
    cmapa[cp] = globidx;
    mptmp[cp] = PETSC_FALSE;
    cp++;
    if (mmdata->P_oth) {
      ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth,&P_oth_l2g);CHKERRQ(ierr);
      ierr = MatSetType(mmdata->P_oth,((PetscObject)(a->B))->type_name);CHKERRQ(ierr);
      ierr = ISLocalToGlobalMappingGetIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
      /* A_off * P_oth: temporary (feeds the next product), not inserted into C directly */
      ierr = MatProductCreate(a->B,mmdata->P_oth,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      mptmp[cp] = PETSC_TRUE;
      cp++;
      /* P_loc^t * (A_off * P_oth) */
      ierr = MatProductCreate(mmdata->Bloc,mp[1],NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      rmapt[cp] = 2;
      rmapa[cp] = globidx;
      cmapt[cp] = 2;
      cmapa[cp] = P_oth_idx;
      mptmp[cp] = PETSC_FALSE;
      cp++;
    }
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]);
  }
  /* stash the intermediate products and their temporary flags in the context */
  ierr = PetscMalloc1(cp,&mmdata->mp);CHKERRQ(ierr);
  for (i = 0; i < cp; i++) mmdata->mp[i] = mp[i];
  ierr = PetscMalloc1(cp,&mmdata->mptmp);CHKERRQ(ierr);
  for (i = 0; i < cp; i++) mmdata->mptmp[i] = mptmp[i];
  mmdata->cp = cp;
  C->product->data       = mmdata;
  C->product->destroy    = MatDestroy_MatMatMPIAIJCUSPARSE;
  C->ops->productnumeric = MatProductNumeric_MPIAIJCUSPARSE_MPIAIJCUSPARSE;

  /* prepare coo coordinates for values insertion */
  ncoo = 0;
  for (cp = 0; cp < mmdata->cp; cp++) {
    Mat_SeqAIJ *mm = (Mat_SeqAIJ*)mp[cp]->data;
    if (mptmp[cp]) continue;
    ncoo += mm->nz;
  }
  ierr = PetscMalloc2(ncoo,&coo_i,ncoo,&coo_j);CHKERRQ(ierr);
  ncoo = 0;
  for (cp = 0; cp < mmdata->cp; cp++) {
    Mat_SeqAIJ     *mm = (Mat_SeqAIJ*)mp[cp]->data;
    PetscInt       *coi = coo_i + ncoo;
    PetscInt       *coj = coo_j + ncoo;
    const PetscInt mr = mp[cp]->rmap->n;
    const PetscInt *jj  = mm->j;
    const PetscInt *ii  = mm->i;

    if (mptmp[cp]) continue;
    /* rows coo */
    if (!rmapt[cp]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"This should not happen");
    else if (rmapt[cp] == 1) { /* local to global for owned rows of  */
      const PetscInt rs = C->rmap->rstart;
      for (i = 0; i < mr; i++) {
        const PetscInt gr = i + rs;
        for (j = ii[i]; j < ii[i+1]; j++) {
          coi[j] = gr;
        }
      }
    } else { /* offprocess */
      const PetscInt *rmap = rmapa[cp];
      for (i = 0; i < mr; i++) {
        const PetscInt gr = rmap[i];
        for (j = ii[i]; j < ii[i+1]; j++) {
          coi[j] = gr;
        }
      }
    }
    /* columns coo */
    if (!cmapt[cp]) {
      ierr = PetscArraycpy(coj,jj,mm->nz);CHKERRQ(ierr);
    } else if (cmapt[cp] == 1) { /* local to global for owned columns of P */
      const PetscInt cs = P->cmap->rstart;
      for (j = 0; j < mm->nz; j++) {
        coj[j] = jj[j] + cs;
      }
    } else { /* offdiag */
      const PetscInt *cmap = cmapa[cp];
      for (j = 0; j < mm->nz; j++) {
        coj[j] = cmap[jj[j]];
      }
    }
    ncoo += mm->nz;
  }
  /* done translating indices: release the index sets borrowed above */
  if (glob) {
    ierr = ISRestoreIndices(glob,&globidx);CHKERRQ(ierr);
  }
  ierr = ISDestroy(&glob);CHKERRQ(ierr);
  if (P_oth_l2g) {
    ierr = ISLocalToGlobalMappingRestoreIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
  }
  ierr = ISLocalToGlobalMappingDestroy(&P_oth_l2g);CHKERRQ(ierr);

  if (hasoffproc) { /* offproc values insertion */
    const PetscInt *sfdeg;
    const PetscInt n = P->cmap->n;
    PetscInt ncoo2,*coo_i2,*coo_j2;

    /* gather each COO entry to the process owning its row; coo_i here holds global row indices.
       NOTE(review): the PETSC_OWN_POINTER copy mode applies to the (NULL) ilocal argument,
       not to coo_i, which is why coo_i can be freed below -- confirm against the PetscSF API */
    ierr = PetscSFCreate(PetscObjectComm((PetscObject)C),&mmdata->sf);CHKERRQ(ierr);
    ierr = PetscSFSetGraphLayout(mmdata->sf,P->cmap,ncoo,NULL,PETSC_OWN_POINTER,coo_i);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeBegin(mmdata->sf,&sfdeg);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(mmdata->sf,&sfdeg);CHKERRQ(ierr);
    for (i = 0, ncoo2 = 0; i < n; i++) ncoo2 += sfdeg[i];
    ierr = PetscMalloc2(ncoo2,&coo_i2,ncoo2,&coo_j2);CHKERRQ(ierr);
    ierr = PetscSFGatherBegin(mmdata->sf,MPIU_INT,coo_i,coo_i2);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(mmdata->sf,MPIU_INT,coo_i,coo_i2);CHKERRQ(ierr);
    ierr = PetscSFGatherBegin(mmdata->sf,MPIU_INT,coo_j,coo_j2);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(mmdata->sf,MPIU_INT,coo_j,coo_j2);CHKERRQ(ierr);
    ierr = PetscFree2(coo_i,coo_j);CHKERRQ(ierr);
    ierr = PetscMalloc1(ncoo,&mmdata->coo_w);CHKERRQ(ierr); /* pre-gather staging buffer for the numeric phase */
    coo_i = coo_i2;
    coo_j = coo_j2;
    ncoo  = ncoo2;
  }

  /* preallocate with COO data */
  ierr = MatSetPreallocationCOO(C,ncoo,coo_i,coo_j);CHKERRQ(ierr);
  ierr = PetscMalloc1(ncoo,&mmdata->coo_v);CHKERRQ(ierr);
  ierr = PetscFree2(coo_i,coo_j);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
684 
/* Select the CUSPARSE symbolic product when B is a GPU matrix and the product
   type is one we support; otherwise defer to the generic MPIAIJ machinery. */
static PetscErrorCode MatProductSetFromOptions_MPIAIJCUSPARSE(Mat mat)
{
  PetscErrorCode ierr;
  Mat_Product    *product = mat->product;
  PetscBool      iscusp  = PETSC_FALSE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* only test the type of B when it has not been pinned to the CPU */
  if (!product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATMPIAIJCUSPARSE,&iscusp);CHKERRQ(ierr);
  }
  if (iscusp && (product->type == MATPRODUCT_AB || product->type == MATPRODUCT_AtB || product->type == MATPRODUCT_PtAP)) {
    mat->ops->productsymbolic = MatProductSymbolic_MPIAIJCUSPARSE_MPIAIJCUSPARSE;
  }
  /* anything not handled above falls back to the CPU AIJ implementation */
  if (!mat->ops->productsymbolic) {
    ierr = MatProductSetFromOptions_MPIAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
712 
/* y = A*x for an MPIAIJCUSPARSE matrix: overlap the ghost-value scatter with
   the diagonal-block product, then accumulate the off-diagonal contribution. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscInt       xlen;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  /* start communicating ghost values while the diagonal block is applied */
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* yy += B * lvec (off-process columns) */
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
728 
/* Zero all entries of an MPIAIJCUSPARSE matrix.  When a split-CSR device
   matrix has been created (MatCUSPARSEGetDeviceMatWrite), its diagonal and
   off-diagonal value arrays are cleared directly on the GPU with cudaMemset
   before the host-side blocks are zeroed. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t  err;
      PetscScalar  *vals;
      /* fixed typo in the info message ("offfdiag") */
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      /* fetch the device-side pointer to the diagonal values, then clear them in place */
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      /* same for the off-diagonal block */
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  /* zero the host-side (and seq-CUSPARSE) blocks through the usual path */
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
/* z = y + A*x for an MPIAIJCUSPARSE matrix; same overlap pattern as MatMult. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscInt       xlen;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  /* overlap the ghost update with the diagonal-block multadd */
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* zz += B * lvec (off-process columns) */
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
771 
/* y = A^T * x: apply the transposes of both blocks locally, then scatter-add
   the off-diagonal contributions back to their owners. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscInt       xlen;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,xlen);
  /* B^T * x lands in the ghost vector; A^T * x in the local part of y */
  ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  /* reverse scatter adds the ghost contributions into y */
  ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
787 
/* Record the requested device storage format (CSR/ELL/HYB) for the diagonal
   block, the off-diagonal block, or both, on an MPIAIJCUSPARSE matrix. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed: message previously listed MAT_CUSPARSE_MULT_DIAG twice and a
       nonexistent MAT_CUSPARSE_MULT_ALL; it now matches the cases handled above */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
810 
/* Process -mat_cusparse_* options controlling the device storage format of
   the diagonal and off-diagonal blocks (only meaningful when unfactored). */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  Mat_MPIAIJ               *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;
  PetscErrorCode           ierr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) { ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,fmt);CHKERRQ(ierr); }
    /* off-diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->offdiagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) { ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,fmt);CHKERRQ(ierr); }
    /* both blocks at once */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) { ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr); }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
841 
/* Finish assembly through the MPIAIJ path, ensure the ghost vector lives on
   the GPU after the first final assembly, and mark the matrix as GPU-resident
   when a device matrix was used for assembly. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat  = cusp->deviceMat;
  PetscErrorCode             ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  /* first final assembly: the ghost vector was just created; move it to CUDA */
  if (mode == MAT_FINAL_ASSEMBLY && !A->was_assembled) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (d_mat) A->offloadmask = PETSC_OFFLOAD_GPU; /* values were assembled on the device */
  PetscFunctionReturn(0);
}
859 
/* Destroy an MPIAIJCUSPARSE matrix: free the device-side split-CSR metadata
   (if one was created via MatCUSPARSEGetDeviceMatWrite), tear down the
   cuSPARSE handle/stream and the spptr struct, remove the composed methods,
   and finish with the generic MPIAIJ destructor. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* pull the struct back to the host so the device pointers it owns can be freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* the row-offset arrays were cudaMalloc'ed only when compressed-row storage
       was in use (see MatCUSPARSEGetDeviceMatWrite); otherwise they alias the
       seq-CUSPARSE arrays and must not be freed here */
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    /* clear the per-block cuSPARSE handles before destroying the shared one */
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* the stream is created only on demand; destroy it if present */
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    /* COO permutation arrays are thrust vectors allocated with new */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* remove the methods composed in MatConvert_MPIAIJ_MPIAIJCUSPARSE */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
906 
/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE: retype the diagonal/off-diagonal
   blocks and the ghost vector to their CUDA variants, allocate and initialize
   the Mat_MPIAIJCUSPARSE spptr (with its cuSPARSE handle), and install the
   CUSPARSE-specific function table and composed methods.
   reuse semantics: MAT_INITIAL_MATRIX duplicates B into *newmat,
   MAT_REUSE_MATRIX copies values into an existing *newmat, and
   MAT_INPLACE_MATRIX converts B itself (caller passes *newmat == B). */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;

  /* vectors created for/by this matrix should default to CUDA */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  /* retype the sequential blocks and the ghost vector (present only after assembly) */
  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  /* allocate the CUSPARSE-specific data once; reuse keeps the existing spptr */
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJCUSPARSE;

  /* composed methods are removed again in MatDestroy_MPIAIJCUSPARSE */
  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
963 
/* Type constructor for MATMPIAIJCUSPARSE: build a plain MPIAIJ matrix, then
   convert it in place to attach the CUSPARSE data and function table. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure the CUDA runtime has been initialized before any device work */
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
974 
975 /*@
976    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
978    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
979    assembly performance the user should preallocate the matrix storage by setting
980    the parameter nz (or the array nnz).  By setting these parameters accurately,
981    performance during matrix assembly can be increased by more than a factor of 50.
982 
983    Collective
984 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
.  d_nz - number of nonzeros per row in the DIAGONAL portion of the local submatrix (same value used for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the DIAGONAL portion of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of the local submatrix (same value used for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the OFF-DIAGONAL portion of the local submatrix (possibly different for each row) or NULL
992 
993    Output Parameter:
994 .  A - the matrix
995 
996    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
997    MatXXXXSetPreallocation() paradigm instead of this routine directly.
998    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
999 
1000    Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
1002 
1003    The AIJ format (also called the Yale sparse matrix format or
1004    compressed row storage), is fully compatible with standard Fortran 77
1005    storage.  That is, the stored row and column indices can begin at
1006    either one (as in Fortran) or zero.  See the users' manual for details.
1007 
   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT, o_nz=PETSC_DEFAULT, and d_nnz=o_nnz=NULL for PETSc to
   control dynamic memory allocation.  For large problems you MUST preallocate
   memory or you will get TERRIBLE performance, see the users' manual chapter on matrices.
1012 
1013    By default, this format uses inodes (identical nodes) when possible, to
1014    improve numerical efficiency of matrix-vector products and solves. We
1015    search for consecutive rows with the same nonzero structure, thereby
1016    reusing matrix information to achieve increased efficiency.
1017 
1018    Level: intermediate
1019 
1020 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
1021 @*/
/* Create an AIJCUSPARSE matrix on the given communicator: sequential CUSPARSE
   on one process, parallel MPI CUSPARSE otherwise, preallocated from the
   supplied per-row nonzero counts. */
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt    size;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size == 1) {
    /* uniprocessor: the sequential CUSPARSE type suffices; o_nz/o_nnz unused */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1040 
1041 /*MC
1042    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
1043 
1044    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1045    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1046    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1047 
1048    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
1049    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
1050    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
1051    for communicators controlling multiple processes.  It is recommended that you call both of
1052    the above preallocation routines for simplicity.
1053 
1054    Options Database Keys:
1055 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
1056 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1057 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1058 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1059 
1060   Level: beginner
1061 
1062  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
1065 
// Get a GPU pointer to a stripped-down split-CSR view of the matrix, for both
// Seq and MPI Mat. On first call the PetscSplitCSRDataStructure is built on
// the device from the (already assembled) CUSPARSE CSR arrays; subsequent
// calls return the cached pointer. On return A->assembled is cleared so the
// caller can write values on the device as MatSetValues would.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;       /* address of the cached device matrix inside A's spptr */
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj; /* device CSR row offsets/column indices of diag (a*) and offdiag (b*) */
  PetscScalar                *aa,*ba;         /* device CSR values of diag and offdiag */

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      /* sequential: only a diagonal block, no off-diagonal CSR arrays */
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      /* parallel: need device CSR arrays of both the diag (A) and offdiag (B) blocks */
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    /* factored matrices carry no device metadata */
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
  }
  if (!*p_d_mat) {
    /* first call: build the split-CSR metadata on the device */
    cudaError_t                 err;
    PetscSplitCSRDataStructure  *d_mat, h_mat; /* h_mat is the host staging copy of the struct */
    Mat_SeqAIJ                  *jaca;
    PetscInt                    n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      /* sequential: whole matrix is the "diagonal" block, ranges start at 0 */
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ  *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SSELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazy) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      /* sentinel used below to detect out-of-range writes by the loop */
      aij->colmap[A->cmap->N] = -9;
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      /* NOTE(review): with compressed rows the device copy is made from the host
         row offsets (jacb->i) rather than the device array bi — presumably the
         uncompressed offsets are wanted by the kernels; confirm against users */
      if (jacb->compressedrow.use) {
        err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat; /* already built: return the cached device pointer */
  }
  A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}
1201