/* /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision f6b722a57d07b8c3a218465fb7fc5d7800a7778a) */
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <petscsf.h>

struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  /* coo_p, when set, is the permutation (computed during preallocation) that groups diagonal-block entries ahead of off-diagonal ones */
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n);
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* locate the first off-diagonal entry; if any diagonal entries follow it, partition so that
     diagonal-block entries come first, recording the permutation in coo_p */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory: the cuSPARSE-specific preallocation routines manage the device allocations internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
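
/*
   A usage sketch for the COO assembly interface implemented by the two routines above. They are
   reached through the public MatSetPreallocationCOO()/MatSetValuesCOO() entry points composed in
   MatConvert_MPIAIJ_MPIAIJCUSPARSE() below; the index data here is hypothetical:

     PetscInt    coo_i[] = {0,0,1};        // global row indices of the entries
     PetscInt    coo_j[] = {0,2,1};        // global column indices of the entries
     PetscScalar v[]     = {1.0,2.0,3.0};  // values, ordered as coo_i/coo_j
     ierr = MatSetPreallocationCOO(A,3,coo_i,coo_j);CHKERRQ(ierr); // analyzes the pattern, splits diag/offdiag
     ierr = MatSetValuesCOO(A,v,INSERT_VALUES);CHKERRQ(ierr);      // can be called repeatedly with new values
*/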

static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
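
/*
   A usage sketch for the routine above, reached through the public MatMPIAIJGetLocalMatMerge()
   interface composed in MatConvert_MPIAIJ_MPIAIJCUSPARSE() below:

     Mat A_loc;
     IS  glob;
     ierr = MatMPIAIJGetLocalMatMerge(A,MAT_INITIAL_MATRIX,&glob,&A_loc);CHKERRQ(ierr); // A_loc = [Ad Ao]
     // glob maps the columns of A_loc back to global column indices
     ierr = ISDestroy(&glob);CHKERRQ(ierr);
     ierr = MatDestroy(&A_loc);CHKERRQ(ierr);
*/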

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because B may have been resized, we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/*@
   MatAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Not collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE or MPIAIJCUSPARSE
-  gen - the boolean flag

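   Example usage (a minimal sketch; A, x, and y are assumed to be set up elsewhere):
.vb
   ierr = MatAIJCUSPARSESetGenerateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
   ierr = MatMultTranspose(A,x,y);CHKERRQ(ierr); /* uses the explicitly generated transpose */
.ve
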
   Level: intermediate

.seealso: MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE
@*/
PetscErrorCode  MatAIJCUSPARSESetGenerateTranspose(Mat A, PetscBool gen)
{
  PetscErrorCode ierr;
  PetscBool      ismpiaij;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  MatCheckPreallocated(A,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATMPIAIJ,&ismpiaij);CHKERRQ(ierr);
  if (ismpiaij) {
    Mat A_d,A_o;

    ierr = MatMPIAIJGetSeqAIJ(A,&A_d,&A_o,NULL);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A_d,gen);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A_o,gen);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,gen);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t  err;
      PetscScalar  *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct  = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

595 +  comm - MPI communicator, set to PETSC_COMM_SELF
596 .  m - number of rows
597 .  n - number of columns
598 .  nz - number of nonzeros per row (same for all rows)
599 -  nnz - array containing the number of nonzeros in the various rows
600          (possibly different for each row) or NULL
601 

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

618    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
619    allocation.  For large problems you MUST preallocate memory or you
620    will get TERRIBLE performance, see the users' manual chapter on matrices.
621 

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

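   Example usage (a minimal sketch; a hypothetical 100x100 global matrix with at most 5
   nonzeros per row in each of the diagonal and off-diagonal blocks):
.vb
   Mat A;
   ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,5,NULL,&A);CHKERRQ(ierr);
   /* set values with MatSetValues() and assemble as for any AIJ matrix */
   ierr = MatDestroy(&A);CHKERRQ(ierr);
.ve
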
628 
629 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
630 @*/
631 PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
632 {
633   PetscErrorCode ierr;
634   PetscMPIInt    size;
635 
636   PetscFunctionBegin;
637   ierr = MatCreate(comm,A);CHKERRQ(ierr);
638   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
639   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
640   if (size > 1) {
641     ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
642     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
643   } else {
644     ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
645     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
646   }
647   PetscFunctionReturn(0);
648 }
649 
/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

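   For example, a hypothetical run that selects this type and the ELL storage format from the
   options database might use
.vb
   ./myapp -mat_type aijcusparse -mat_cusparse_storage_format ell
.ve
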
   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

// get a GPU pointer to the stripped-down Mat; works for both Seq and MPI Mat
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // mimic the bookkeeping of MatSetValues, since MatSetValues itself is not called on the host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assembled more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done lazily in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need to assemble matrix first");
  }
  if (!*p_d_mat) {
    cudaError_t                 err;
    PetscSplitCSRDataStructure  *d_mat, h_mat;
    Mat_SeqAIJ                  *jaca;
    PetscInt                    n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ  *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done lazily in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9; // sentinel, verified below after the colmap is filled in
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with MatSetValues; this is done lazily in the normal MatSetValues path
  PetscFunctionReturn(0);
#endif
}
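
/*
   A usage sketch for MatCUSPARSEGetDeviceMatWrite(); the kernel name is hypothetical, but the
   signature and the assembly-state handling follow the code above:

     PetscSplitCSRDataStructure *d_mat;
     ierr = MatCUSPARSEGetDeviceMatWrite(A,&d_mat);CHKERRQ(ierr);  // A must already be assembled
     myMatAssemblyKernel<<<nblocks,nthreads>>>(d_mat);             // set entries on the device
     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);  // finalize as usual
     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
*/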