xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision a4af0ceea8a251db97ee0dc5c0d52d4adf50264a)
1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
299acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
30fd2b57fSKarl Rupp 
43d13b8fdSMatthew G. Knepley #include <petscconf.h>
59ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
657d48284SJunchao Zhang #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
73d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
87e8381f9SStefano Zampini #include <thrust/advance.h>
94eb5378fSStefano Zampini #include <petscsf.h>
107e8381f9SStefano Zampini 
/* Thrust functor: given a tuple (src, dst), copy src into dst.
   Used below with zip iterators to scatter permuted COO values into the
   coo_pw work array on the device. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
207e8381f9SStefano Zampini 
/* Insert/add numerical values into a COO-preallocated MPIAIJCUSPARSE matrix.

   v     - values (host or device memory), ordered as the coo_i/coo_j arrays
           originally passed to MatSetPreallocationCOO; may be NULL
   imode - INSERT_VALUES or ADD_VALUES

   If preallocation had to reorder the entries (diagonal block first, see
   coo_p built in MatSetPreallocationCOO_MPIAIJCUSPARSE), the values are
   permuted on the device into the coo_pw work array before being handed to
   the sequential (diag/offdiag) kernels. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total COO entries: diagonal + off-diagonal */
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* v lives on the host: stage it in a temporary device array */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* zip (value picked through the permutation, destination slot in coo_pw) */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    /* coo_pw now holds values diagonal-first: first coo_nd go to A, rest to B */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    /* no permutation was needed at preallocation: v is already split diagonal-first */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* mark the matrix assembled, with the freshest copy of the data on the GPU */
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
637e8381f9SStefano Zampini 
/* Thrust predicate on (row, col, perm) tuples: true when the column index
   (second tuple element) belongs to the local diagonal block [_cstart,_cend). */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return (col >= _cstart) && (col < _cend); /* inside the owned column range */
  }
};
767e8381f9SStefano Zampini 
/* Thrust predicate: true when a column index falls outside the local
   diagonal block [_cstart,_cend), i.e. the entry is off-diagonal. */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    if (c < _cstart) return true; /* left of the owned range */
    return c >= _cend;            /* right of the owned range */
  }
};
887e8381f9SStefano Zampini 
/* Thrust transform functor: shift a global index into the local numbering
   by subtracting the owning process's range start. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &g)
  {
    const PetscInt local = g - _start;
    return local;
  }
};
1007e8381f9SStefano Zampini 
/* Preallocate an MPIAIJCUSPARSE matrix from COO (i,j) index arrays.

   B      - the matrix
   n      - number of COO entries on this process
   coo_i  - global row indices
   coo_j  - global column indices

   Partitions the entries on the device into diagonal-block entries first and
   off-diagonal entries second (recording the permutation in cusp->coo_p when
   a reorder was needed), builds the compacted off-diagonal column map
   (b->garray), and preallocates the two sequential blocks b->A and b->B. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0; /* number of distinct off-diagonal (ghost) columns */
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* discard any previous sequential blocks and their cuSPARSE handles */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  /* upload the COO indices to the device */
  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* locate the first off-diagonal entry and the first diagonal entry after it;
     if both exist the input is interleaved and must be partitioned */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* need a stable record of the reorder: coo_p is the permutation, coo_pw a
       value work array used later by MatSetValuesCOO */
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    /* move diagonal-block entries (and their permutation slots) to the front */
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # off-diagonal entries */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  /* b->garray maps local ghost column -> global column */
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* translate the off-diagonal column ids in jj[] to the compacted local numbering */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* (re)create the sequential diagonal (b->A) and off-diagonal (b->B) blocks */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  /* structure is set but no values yet: not assembled, host copy authoritative */
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
2029ae82921SPaul Mullowney 
/* Merge the diagonal and off-diagonal blocks of A into one sequential
   AIJCUSPARSE matrix; optionally return the global column indices of the
   merged matrix (owned columns first, then ghost columns) in glob. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cstart,k,nd,no,*gidx;

    ierr = MatGetLocalSize(Ad,NULL,&nd);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&no);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cstart,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(nd+no,&gidx);CHKERRQ(ierr);
    for (k=0; k<nd; k++) gidx[k] = cstart + k;  /* owned columns are contiguous from cstart */
    for (k=0; k<no; k++) gidx[nd+k] = cmap[k];  /* ghost columns from the off-diagonal map */
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),nd+no,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
225ed502f03SStefano Zampini 
/* Classic (nnz-based) preallocation for an MPIAIJCUSPARSE matrix.

   d_nz/d_nnz - uniform / per-row nonzero counts for the diagonal block
   o_nz/o_nnz - uniform / per-row nonzero counts for the off-diagonal block

   Destroys stale communication structures, (re)creates the sequential
   SEQAIJCUSPARSE blocks b->A and b->B, preallocates them, and propagates
   the stored cuSPARSE formats and handle. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* debug-only sanity checks on the per-row counts */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  /* drop stale column map / ghost data from any previous assembly */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    /* on a single rank there is no off-diagonal part, so give B zero columns */
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* propagate the desired cuSPARSE storage formats and the shared handle */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
285e057df02SPaul Mullowney 
/* yy = A*xx for an MPIAIJCUSPARSE matrix: the local diagonal product is
   overlapped with the ghost-value scatter, then the off-diagonal block
   contribution is accumulated. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nx;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nx);CHKERRQ(ierr);
  if (nx != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nx);
  /* start pulling ghost values while the diagonal block multiplies */
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* add the off-diagonal contribution using the received ghost values */
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
301ca45077fSPaul Mullowney 
/* Zero all entries of an MPIAIJCUSPARSE matrix by zeroing both of its
   sequential blocks. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *aij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(aij->A);CHKERRQ(ierr); /* diagonal block */
  ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); /* off-diagonal block */
  PetscFunctionReturn(0);
}
312042217e8SBarry Smith 
/* zz = yy + A*xx for an MPIAIJCUSPARSE matrix; same overlap pattern as
   MatMult: scatter ghosts while the diagonal block computes. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nx;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nx);CHKERRQ(ierr);
  if (nx != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nx);
  /* begin the ghost-value scatter, compute zz = yy + Ad*xx meanwhile */
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* accumulate the off-diagonal contribution into zz */
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
328fdc842d1SBarry Smith 
/* yy = A^T*xx for an MPIAIJCUSPARSE matrix: the off-diagonal transpose
   product lands in the ghost vector and is accumulated into yy by a
   reverse scatter after the diagonal transpose product. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nx;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nx);CHKERRQ(ierr);
  if (nx != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nx);
  ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  /* reverse scatter adds the ghost contributions back to their owners */
  ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3449ae82921SPaul Mullowney 
/* Select the cuSPARSE storage format used for SpMV on the blocks of an
   MPIAIJCUSPARSE matrix.

   op     - which block(s) to set: MAT_CUSPARSE_MULT_DIAG,
            MAT_CUSPARSE_MULT_OFFDIAG, or MAT_CUSPARSE_ALL
   format - the storage format to record

   Raises PETSC_ERR_SUP for any other operation value. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fix: the old message listed MAT_CUSPARSE_MULT_DIAG twice, omitted
       MAT_CUSPARSE_MULT_OFFDIAG, and misnamed MAT_CUSPARSE_ALL */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
367e057df02SPaul Mullowney 
/* Process command-line options for an MPIAIJCUSPARSE matrix.

   Supported options (only for non-factored matrices):
     -mat_cusparse_mult_diag_storage_format     : format of the diagonal block
     -mat_cusparse_mult_offdiag_storage_format  : format of the off-diagonal block
     -mat_cusparse_storage_format               : format of both blocks */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* each option falls through to MatCUSPARSESetFormat only when set by the user */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
398e057df02SPaul Mullowney 
/* Finish assembly of an MPIAIJCUSPARSE matrix.

   Delegates to MatAssemblyEnd_MPIAIJ, makes sure the ghost vector lives on
   the GPU, and — if the nonzero structure changed — frees the cached split
   CSR device matrix so it will be rebuilt on demand. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz = A->nonzerostate; /* nonzero state before assembly, to detect structural changes */

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    /* the struct lives in device memory: copy it to the host to read the
       embedded device pointers, free those, then free the struct itself */
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
42334d6c7a5SJose E. Roman 
/* Destroy an MPIAIJCUSPARSE matrix: free the cached split CSR device
   matrix, the COO permutation/work arrays, the cuSPARSE handle and the
   Mat_MPIAIJCUSPARSE struct, unregister composed methods, then fall
   through to MatDestroy_MPIAIJ. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the device-resident struct to the host so its embedded device
       pointers (colmap) can be freed before freeing the struct itself */
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try {
    /* clear per-block handles before destroying the shared one */
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* remove all methods composed on this object by the cusparse implementation */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
468ca45077fSPaul Mullowney 
/* Convert an MPIAIJ matrix B to MPIAIJCUSPARSE, returning the result in *newmat.
   The sequential diagonal/off-diagonal blocks and the communication vector are retyped
   to their CUDA variants, the Mat_MPIAIJCUSPARSE bookkeeping is created on first
   conversion, and the CUDA-specific operations and composed methods are installed.
   For MAT_INPLACE_MATRIX, *newmat already aliases B and B itself is converted. */
4693338378cSStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
4709ae82921SPaul Mullowney {
4719ae82921SPaul Mullowney   PetscErrorCode     ierr;
472bbf3fe20SPaul Mullowney   Mat_MPIAIJ         *a;
473bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE *cusparseStruct;
474b06137fdSPaul Mullowney   cusparseStatus_t   stat;
4753338378cSStefano Zampini   Mat                A;
4769ae82921SPaul Mullowney 
4779ae82921SPaul Mullowney   PetscFunctionBegin;
  /* Realize the requested reuse mode: duplicate for a brand-new matrix, copy values
     when reusing an existing one; MAT_INPLACE_MATRIX falls through untouched. */
4783338378cSStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4793338378cSStefano Zampini     ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
4803338378cSStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
4813338378cSStefano Zampini     ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
4823338378cSStefano Zampini   }
4833338378cSStefano Zampini   A = *newmat;
  /* From here on operate on the output matrix: allow GPU execution and make CUDA
     vectors the default vector type it creates. */
4846f3d89d0SStefano Zampini   A->boundtocpu = PETSC_FALSE;
48534136279SStefano Zampini   ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
48634136279SStefano Zampini   ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);
48734136279SStefano Zampini 
  /* Retype the inner sequential blocks and the scatter work vector (they may not
     exist yet if preallocation has not been done). */
488bbf3fe20SPaul Mullowney   a = (Mat_MPIAIJ*)A->data;
489d98d7c49SStefano Zampini   if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
490d98d7c49SStefano Zampini   if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
491d98d7c49SStefano Zampini   if (a->lvec) {
492d98d7c49SStefano Zampini     ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
493d98d7c49SStefano Zampini   }
494d98d7c49SStefano Zampini 
  /* First conversion of this matrix: allocate the MPIAIJCUSPARSE bookkeeping
     (storage formats, COO scratch buffers, cusparse library handle). */
4953338378cSStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
496bbf3fe20SPaul Mullowney     a->spptr = new Mat_MPIAIJCUSPARSE;
4972205254eSKarl Rupp 
498bbf3fe20SPaul Mullowney     cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
499e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
500e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
5017e8381f9SStefano Zampini     cusparseStruct->coo_p               = NULL;
5027e8381f9SStefano Zampini     cusparseStruct->coo_pw              = NULL;
503042217e8SBarry Smith     cusparseStruct->stream              = 0; /* i.e. use the default (PETSc) CUDA stream */
5043fa6b06aSMark Adams     cusparseStruct->deviceMat           = NULL;
505042217e8SBarry Smith     stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
5063338378cSStefano Zampini   }
5072205254eSKarl Rupp 
  /* Install the CUDA-specific implementations over the MPIAIJ ops table. */
50834d6c7a5SJose E. Roman   A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
509bbf3fe20SPaul Mullowney   A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
510fdc842d1SBarry Smith   A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
511bbf3fe20SPaul Mullowney   A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
512bbf3fe20SPaul Mullowney   A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
513bbf3fe20SPaul Mullowney   A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
5143fa6b06aSMark Adams   A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
5154e84afc0SStefano Zampini   A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
5162205254eSKarl Rupp 
  /* Publish the type name and the type-specific methods that generic code looks up
     by string; MatDestroy_MPIAIJCUSPARSE removes these again. */
517bbf3fe20SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
518ed502f03SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
5193338378cSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
520bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
5217e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
5227e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
523ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE)
524ae48a8d0SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
525ae48a8d0SStefano Zampini #endif
5269ae82921SPaul Mullowney   PetscFunctionReturn(0);
5279ae82921SPaul Mullowney }
5289ae82921SPaul Mullowney 
/* Type constructor for MATMPIAIJCUSPARSE: initialize the CUDA device, build a plain
   MPIAIJ matrix, then convert it in place to the CUSPARSE type. */
5293338378cSStefano Zampini PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
5303338378cSStefano Zampini {
5313338378cSStefano Zampini   PetscErrorCode ierr;
5323338378cSStefano Zampini 
5333338378cSStefano Zampini   PetscFunctionBegin;
  /* device must be initialized before any cusparse/CUDA allocations happen below */
534*a4af0ceeSJacob Faibussowitsch   ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
5353338378cSStefano Zampini   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
5363338378cSStefano Zampini   ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
5373338378cSStefano Zampini   PetscFunctionReturn(0);
5383338378cSStefano Zampini }
5393338378cSStefano Zampini 
5403f9c0db1SPaul Mullowney /*@
5413f9c0db1SPaul Mullowney    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
542e057df02SPaul Mullowney    (the default parallel PETSc format).  This matrix will ultimately be pushed down
5433f9c0db1SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
544e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
545e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
546e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
5479ae82921SPaul Mullowney 
548d083f849SBarry Smith    Collective
549e057df02SPaul Mullowney 
550e057df02SPaul Mullowney    Input Parameters:
551e057df02SPaul Mullowney +  comm - MPI communicator
552e057df02SPaul Mullowney .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
553e057df02SPaul Mullowney .  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
554e057df02SPaul Mullowney .  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
554e057df02SPaul Mullowney .  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
555e057df02SPaul Mullowney .  d_nz - number of nonzeros per row in the DIAGONAL portion of the local submatrix (same for all local rows)
555e057df02SPaul Mullowney .  d_nnz - array containing the number of nonzeros in the various rows of the DIAGONAL portion of the local submatrix
5560298fd71SBarry Smith          (possibly different for each row) or NULL
555e057df02SPaul Mullowney .  o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of the local submatrix (same for all local rows)
555e057df02SPaul Mullowney -  o_nnz - array containing the number of nonzeros in the various rows of the OFF-DIAGONAL portion of the local submatrix
5560298fd71SBarry Smith          (possibly different for each row) or NULL
557e057df02SPaul Mullowney 
558e057df02SPaul Mullowney    Output Parameter:
559e057df02SPaul Mullowney .  A - the matrix
560e057df02SPaul Mullowney 
561e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
562e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
563e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
564e057df02SPaul Mullowney 
565e057df02SPaul Mullowney    Notes:
566e057df02SPaul Mullowney    If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
567e057df02SPaul Mullowney 
568e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
569e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
570e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
571e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
572e057df02SPaul Mullowney 
573e057df02SPaul Mullowney    Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
5740298fd71SBarry Smith    Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
575e057df02SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
576e057df02SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
577e057df02SPaul Mullowney 
578e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
579e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
580e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
581e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
582e057df02SPaul Mullowney 
583e057df02SPaul Mullowney    Level: intermediate
584e057df02SPaul Mullowney 
585e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
586e057df02SPaul Mullowney @*/
/* Create and preallocate an AIJCUSPARSE matrix, dispatching to the sequential or
   parallel variant based on the size of the communicator.  Note that o_nz/o_nnz are
   unused on a single-rank communicator (there is no off-diagonal block). */
5879ae82921SPaul Mullowney PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
5889ae82921SPaul Mullowney {
5899ae82921SPaul Mullowney   PetscErrorCode ierr;
5909ae82921SPaul Mullowney   PetscMPIInt    size;
5919ae82921SPaul Mullowney 
5929ae82921SPaul Mullowney   PetscFunctionBegin;
5939ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
5949ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  /* pick the MPI or sequential CUSPARSE type to match the communicator */
595ffc4695bSBarry Smith   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
5969ae82921SPaul Mullowney   if (size > 1) {
5979ae82921SPaul Mullowney     ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
5989ae82921SPaul Mullowney     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
5999ae82921SPaul Mullowney   } else {
6009ae82921SPaul Mullowney     ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
6019ae82921SPaul Mullowney     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
6029ae82921SPaul Mullowney   }
6039ae82921SPaul Mullowney   PetscFunctionReturn(0);
6049ae82921SPaul Mullowney }
6059ae82921SPaul Mullowney 
6063ca39a21SBarry Smith /*MC
6076bb69460SJunchao Zhang    MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.
608e057df02SPaul Mullowney 
6092692e278SPaul Mullowney    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
6102692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
6112692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
6129ae82921SPaul Mullowney 
6139ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
6149ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
6159ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
6169ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
6179ae82921SPaul Mullowney    the above preallocation routines for simplicity.
6189ae82921SPaul Mullowney 
6199ae82921SPaul Mullowney    Options Database Keys:
620e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
6218468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
6228468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
6238468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
6249ae82921SPaul Mullowney 
6259ae82921SPaul Mullowney   Level: beginner
6269ae82921SPaul Mullowney 
6276bb69460SJunchao Zhang  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
6286bb69460SJunchao Zhang M*/
6296bb69460SJunchao Zhang 
6306bb69460SJunchao Zhang /*MC
6316bb69460SJunchao Zhang    MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
6326bb69460SJunchao Zhang 
6336bb69460SJunchao Zhang   Level: beginner
6346bb69460SJunchao Zhang 
6356bb69460SJunchao Zhang  .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
6369ae82921SPaul Mullowney M*/
6373fa6b06aSMark Adams 
638042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
639042217e8SBarry Smith 
640042217e8SBarry Smith // get GPU pointers to stripped down Mat. For both seq and MPI Mat.
/* Build (or reuse) a PetscSplitCSRDataStructure that exposes raw device pointers to
   the CSR arrays of A's diagonal block -- and, on more than one rank, its off-diagonal
   block and a global-to-local column map -- so that values can be written directly in
   device code.  Handles both SEQAIJ(CUSPARSE) and MPIAIJ(CUSPARSE) matrices; the
   matrix must already be assembled and unfactored. */
641042217e8SBarry Smith PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
6423fa6b06aSMark Adams {
643042217e8SBarry Smith   PetscSplitCSRDataStructure d_mat;
644042217e8SBarry Smith   PetscMPIInt                size;
6453fa6b06aSMark Adams   PetscErrorCode             ierr;
646042217e8SBarry Smith   int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
647042217e8SBarry Smith   PetscScalar                *aa = NULL,*ba = NULL;
648042217e8SBarry Smith   Mat_SeqAIJ                 *jaca = NULL;
649042217e8SBarry Smith   Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
650042217e8SBarry Smith   CsrMatrix                  *matrixA = NULL,*matrixB = NULL;
6513fa6b06aSMark Adams 
6523fa6b06aSMark Adams   PetscFunctionBegin;
653042217e8SBarry Smith   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  /* factored matrices have no device representation; signal this with a NULL result */
654042217e8SBarry Smith   if (A->factortype != MAT_FACTOR_NONE) {
6553fa6b06aSMark Adams     *B = NULL;
6563fa6b06aSMark Adams     PetscFunctionReturn(0);
6573fa6b06aSMark Adams   }
658042217e8SBarry Smith   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  /* On one rank A may be either a true SEQAIJ matrix or an MPIAIJ matrix on a
     communicator of size one; in both cases there is no off-diagonal block
     (bi/bj/ba stay NULL). */
6593fa6b06aSMark Adams   if (size == 1) {
660042217e8SBarry Smith     PetscBool isseqaij;
661042217e8SBarry Smith 
662042217e8SBarry Smith     ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
663042217e8SBarry Smith     if (isseqaij) {
6643fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)A->data;
665042217e8SBarry Smith       if (!jaca->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
666042217e8SBarry Smith       cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
667042217e8SBarry Smith       d_mat = cusparsestructA->deviceMat;
668042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
6693fa6b06aSMark Adams     } else {
6703fa6b06aSMark Adams       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
671042217e8SBarry Smith       if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
672042217e8SBarry Smith       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
6733fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)aij->A->data;
674042217e8SBarry Smith       cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
675042217e8SBarry Smith       d_mat = spptr->deviceMat;
676042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
6773fa6b06aSMark Adams     }
678042217e8SBarry Smith     if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
679042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
680042217e8SBarry Smith       if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
681042217e8SBarry Smith       matrixA = (CsrMatrix*)matstruct->mat;
682042217e8SBarry Smith       bi = NULL;
683042217e8SBarry Smith       bj = NULL;
684042217e8SBarry Smith       ba = NULL;
685042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
686042217e8SBarry Smith   } else {
687042217e8SBarry Smith     Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
688042217e8SBarry Smith     if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
689042217e8SBarry Smith     jaca = (Mat_SeqAIJ*)aij->A->data;
690042217e8SBarry Smith     Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data;
691042217e8SBarry Smith     Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
6923fa6b06aSMark Adams 
693042217e8SBarry Smith     if (!A->nooffprocentries && !aij->donotstash) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
694042217e8SBarry Smith     cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
695042217e8SBarry Smith     Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
696042217e8SBarry Smith     if (cusparsestructA->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
697042217e8SBarry Smith     if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
698042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
699042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
700042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
701042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
702042217e8SBarry Smith       if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
703042217e8SBarry Smith       if (!matstructB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
704042217e8SBarry Smith       matrixA = (CsrMatrix*)matstructA->mat;
705042217e8SBarry Smith       matrixB = (CsrMatrix*)matstructB->mat;
      /* B stores compressed rows: build (once, and cache on the GPU) the full
         rmap->n+1 row-offset array from the host jacb->i array */
7063fa6b06aSMark Adams       if (jacb->compressedrow.use) {
707042217e8SBarry Smith         if (!cusparsestructB->rowoffsets_gpu) {
708042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
709042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
7103fa6b06aSMark Adams         }
711042217e8SBarry Smith         bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
712042217e8SBarry Smith       } else {
713042217e8SBarry Smith         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
714042217e8SBarry Smith       }
715042217e8SBarry Smith       bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
716042217e8SBarry Smith       ba = thrust::raw_pointer_cast(matrixB->values->data());
717042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
718042217e8SBarry Smith     d_mat = spptr->deviceMat;
719042217e8SBarry Smith   }
  /* Same treatment for the diagonal block A: uncompress row offsets if needed */
7203fa6b06aSMark Adams   if (jaca->compressedrow.use) {
721042217e8SBarry Smith     if (!cusparsestructA->rowoffsets_gpu) {
722042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
723042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
7243fa6b06aSMark Adams     }
725042217e8SBarry Smith     ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
7263fa6b06aSMark Adams   } else {
727042217e8SBarry Smith     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
7283fa6b06aSMark Adams   }
729042217e8SBarry Smith   aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
730042217e8SBarry Smith   aa = thrust::raw_pointer_cast(matrixA->values->data());
731042217e8SBarry Smith 
732042217e8SBarry Smith   if (!d_mat) {
733042217e8SBarry Smith     cudaError_t                cerr;
734042217e8SBarry Smith     PetscSplitCSRDataStructure h_mat;
735042217e8SBarry Smith 
736042217e8SBarry Smith     // create and populate the struct on the host, then copy it to the device
737042217e8SBarry Smith     ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
738042217e8SBarry Smith     ierr = PetscNew(&h_mat);CHKERRQ(ierr);
739042217e8SBarry Smith     cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
740042217e8SBarry Smith     if (size > 1) { /* need the colmap array */
741042217e8SBarry Smith       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
742042217e8SBarry Smith       int        *colmap;
743042217e8SBarry Smith       PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;
744042217e8SBarry Smith 
745b3c64f9dSJunchao Zhang       if (n && !aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
746042217e8SBarry Smith 
      /* colmap[g] = (local off-diagonal column of global column g) + 1, or 0 when
         global column g is not present in the off-diagonal block (PetscCalloc zeros) */
747042217e8SBarry Smith       ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
748042217e8SBarry Smith       for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
749042217e8SBarry Smith 
750042217e8SBarry Smith       h_mat->offdiag.i = bi;
751042217e8SBarry Smith       h_mat->offdiag.j = bj;
752042217e8SBarry Smith       h_mat->offdiag.a = ba;
753042217e8SBarry Smith       h_mat->offdiag.n = A->rmap->n;
754042217e8SBarry Smith 
755042217e8SBarry Smith       cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(int));CHKERRCUDA(cerr);
756042217e8SBarry Smith       cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(int),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
757042217e8SBarry Smith       ierr = PetscFree(colmap);CHKERRQ(ierr);
758042217e8SBarry Smith     }
    /* stash raw device pointers and ownership-range metadata in the host mirror */
759042217e8SBarry Smith     h_mat->rstart = A->rmap->rstart;
760042217e8SBarry Smith     h_mat->rend   = A->rmap->rend;
761042217e8SBarry Smith     h_mat->cstart = A->cmap->rstart;
762042217e8SBarry Smith     h_mat->cend   = A->cmap->rend;
763042217e8SBarry Smith     h_mat->N      = A->cmap->N;
764042217e8SBarry Smith     h_mat->diag.i = ai;
765042217e8SBarry Smith     h_mat->diag.j = aj;
766042217e8SBarry Smith     h_mat->diag.a = aa;
767042217e8SBarry Smith     h_mat->diag.n = A->rmap->n;
768042217e8SBarry Smith     h_mat->rank   = PetscGlobalRank;
769042217e8SBarry Smith     // copy pointers and metadata to device
770042217e8SBarry Smith     cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
771042217e8SBarry Smith     ierr = PetscFree(h_mat);CHKERRQ(ierr);
772042217e8SBarry Smith   } else {
773042217e8SBarry Smith     ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
774042217e8SBarry Smith   }
  /* NOTE(review): the freshly allocated d_mat does not appear to be stored back into
     spptr->deviceMat / cusparsestructA->deviceMat anywhere in this function -- verify
     the cache is populated by a caller, otherwise each call re-creates the device
     struct and the MatDestroy-time cudaFree of deviceMat never fires. */
775042217e8SBarry Smith   *B = d_mat;
  /* the returned struct is for writing values on the device; mark the GPU copy as
     the authoritative one */
776042217e8SBarry Smith   A->offloadmask = PETSC_OFFLOAD_GPU;
7773fa6b06aSMark Adams   PetscFunctionReturn(0);
7783fa6b06aSMark Adams }
779