/* petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu */
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

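/* Thrust functor: copies the first tuple element into the second; used in
   MatSetValuesCOO below to scatter permuted COO values into the work array */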
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (cusp->coo_p) {
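    /* coo_p holds the permutation computed at preallocation time that groups
       diagonal-block entries before off-diagonal ones; apply it to the incoming
       values through a permutation iterator before splitting them between a->A and a->B */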
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
    THRUSTARRAY w(v,v+n);
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(w.begin(),cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(w.begin(),cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}

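/* Helper functors for MatSetPreallocationCOO below: predicates that classify a
   global column index as inside or outside the local diagonal block [cstart,cend),
   and a transform that shifts global indices to process-local ones */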
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n);
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
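  /* Locate the first entry belonging to the off-diagonal block; if diagonal-block
     entries still follow it, the input is not grouped by block, so build a
     permutation (coo_p) that stably partitions diagonal-block entries first and
     keep it for use in MatSetValuesCOO */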
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
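  /* sort and compress the off-diagonal column indices on the device to build the
     compact global column map (garray) */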
  auto o_j = d_j.begin();
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory; the cusparse-specific preallocation calls manage it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  if (b->lvec) {
    ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
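  /* overlap the halo exchange with the diagonal-block multiply */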
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t  err;
      PetscScalar  *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
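  /* for the transpose product, apply the off-diagonal block first, then accumulate
     its contribution into yy with a reverse scatter after the diagonal-block product */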
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
  PetscInt                   nnz_state = A->nonzerostate;

  PetscFunctionBegin;
  if (d_mat) {
    cudaError_t err;
    err = cudaMemcpy( &nnz_state, &d_mat->nonzerostate, sizeof(PetscInt), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
  }
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (nnz_state > A->nonzerostate) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;

  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat           = NULL;
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
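
   Example usage (a minimal sketch; the sizes and nonzero counts are illustrative
   and error checking is omitted):
.vb
      Mat A;
      MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,&A);
      MatSetValue(A,0,0,1.0,INSERT_VALUES);
      MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
      MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
      MatDestroy(&A);
.ve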

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic
   memory allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
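
   For example, to store the off-diagonal block in ELL format (a usage sketch;
   the executable name is illustrative):
.vb
      ./app -mat_type mpiaijcusparse -mat_cusparse_mult_offdiag_storage_format ell
.ve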

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

// get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
      }
    } else {
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
        if (rank==-1) {
          for (unsigned int i = 0; i < matrixB->row_offsets->size(); i++)
            std::cout << "\trow_offsets[" << i << "] = " << (*matrixB->row_offsets)[i] << std::endl;
          for (unsigned int i = 0; i < matrixB->column_indices->size(); i++)
            std::cout << "\tcolumn_indices[" << i << "] = " << (*matrixB->column_indices)[i] << std::endl;
          for (unsigned int i = 0; i < matrixB->values->size(); i++)
            std::cout << "\tvalues[" << i << "] = " << (*matrixB->values)[i] << std::endl;
        }
      } else {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
      }
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assembled more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazily) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assembled matrix");
  }
  if (!*p_d_mat) {
    cudaError_t                 err;
    PetscSplitCSRDataStructure  *d_mat, h_mat;
    Mat_SeqAIJ                  *jaca;
    PetscInt                    n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.nonzerostate = A->nonzerostate;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
      h_mat.seq = PETSC_TRUE;
    } else {
      Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ  *jacb;
      h_mat.seq = PETSC_FALSE; // for MatAssemblyEnd_SeqAIJCUSPARSE
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      h_mat.nonzerostate = aij->A->nonzerostate; // just keep one nonzero state?
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazily) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9;
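      /* colmap[A->cmap->N] is a sentinel; the check below catches any garray
         entry writing past the end of colmap */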
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else {
        h_mat.offdiag.i = bi;
      }
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with MatSetValues - this is done (lazily) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}