xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 2c71b3e237ead271e4f3aa1505f92bf476e3413d)
1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
299acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
30fd2b57fSKarl Rupp 
43d13b8fdSMatthew G. Knepley #include <petscconf.h>
59ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
657d48284SJunchao Zhang #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
73d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
87e8381f9SStefano Zampini #include <thrust/advance.h>
9a2cee5feSJed Brown #include <thrust/partition.h>
10a2cee5feSJed Brown #include <thrust/sort.h>
11a2cee5feSJed Brown #include <thrust/unique.h>
124eb5378fSStefano Zampini #include <petscsf.h>
137e8381f9SStefano Zampini 
/* Thrust functor for use with zip iterators: copies tuple element 0 into tuple
   element 1.  Used below with thrust::for_each to gather permuted COO values
   into a device work array. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
237e8381f9SStefano Zampini 
/*
  MatSetValuesCOO_MPIAIJCUSPARSE - insert/add the numerical values v[] (ordered as
  the user's original COO i/j arrays) into the diagonal (a->A) and off-diagonal
  (a->B) blocks of the matrix.

  v may point to host or device memory (detected via isCudaMem()); it may also be
  NULL, in which case NULL is forwarded to the sequential COO kernels.
  imode (INSERT_VALUES/ADD_VALUES) is forwarded unchanged.
*/
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total COO entries: diagonal-block + off-diagonal-block */
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    /* coo_p is the permutation built at preallocation time that moves diagonal-block
       entries in front of off-diagonal ones; apply it to v into the work array coo_pw */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* host values: stage a device copy first */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* zip (permuted source, destination) so one for_each performs the whole gather */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    /* first coo_nd permuted values feed the diagonal block, the remainder the off-diagonal block */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    /* values already come partitioned (or are NULL): pass them straight through */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  /* mark the matrix assembled with its up-to-date copy on the GPU */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
667e8381f9SStefano Zampini 
/* Predicate on a (row,col,permutation) tuple: true when the column index (tuple
   slot 1) lies inside the local diagonal-block column range [_lo,_hi).
   Used as the thrust::partition predicate so diagonal-block entries end up first. */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _lo,_hi;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _lo(cstart), _hi(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return _lo <= col && col < _hi;
  }
};
797e8381f9SStefano Zampini 
/* Predicate: true when the global column index c falls OUTSIDE the local
   diagonal-block column ownership range [_lo,_hi). */
struct IsOffDiag
{
  PetscInt _lo,_hi;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _lo(cstart), _hi(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return !(_lo <= c && c < _hi);
  }
};
917e8381f9SStefano Zampini 
/* Unary transform: convert a global index to a local one by subtracting the
   owning process's range start. */
struct GlobToLoc
{
  PetscInt _shift;

  GlobToLoc(PetscInt start) : _shift(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _shift;
  }
};
1037e8381f9SStefano Zampini 
/*
  MatSetPreallocationCOO_MPIAIJCUSPARSE - build the diagonal (b->A) and off-diagonal
  (b->B) SEQAIJCUSPARSE blocks from n COO entries with global indices coo_i[]/coo_j[].

  On the device, entries whose column lies inside [B->cmap->rstart,B->cmap->rend)
  (the diagonal block) are moved in front of the off-diagonal entries; the
  permutation realizing this reorder is kept in cusp->coo_p so that
  MatSetValuesCOO can later apply it to the user's value array.  The distinct
  off-diagonal columns are compacted through a hash-based global-to-local mapping,
  producing b->garray as usual for MPIAIJ matrices.
*/
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               N,*jj;
  size_t                 noff = 0; /* number of distinct off-diagonal (ghost) columns */
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* throw away previously built blocks and communication data: everything is rebuilt below */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* locate the first off-diagonal entry and, past it, the next diagonal entry:
     when both exist the input is not already in diag-then-offdiag order and we
     must permute; otherwise the input needs no reordering (coo_p stays NULL) */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    /* partition (i,j,permutation) triples together so diagonal-block entries come first */
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* #entries feeding the diagonal block */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* #entries feeding the off-diagonal block */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  /* b->garray maps local ghost column index -> global column index */
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* hash-based inverse mapping compacts the global ids in jj[] to local ghost ids */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj);CHKERRQ(ierr);
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* (re)create the two sequential blocks; the off-diagonal block has noff compacted columns */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  /* propagate the shared cuSPARSE handle and the requested storage formats */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
2059ae82921SPaul Mullowney 
/*
  MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE - merge the diagonal and off-diagonal
  blocks of A into a single sequential CUSPARSE matrix A_loc; when glob is
  non-NULL, also return the global column indices of the merged matrix:
  the owned (contiguous) columns first, then the ghost columns from cmap/garray.
*/
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Adiag,Aoffd;
  const PetscInt *garray;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Adiag,&Aoffd,&garray);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Adiag,Aoffd,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cstart,k,nd,no,*indices;

    ierr = MatGetLocalSize(Adiag,NULL,&nd);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Aoffd,NULL,&no);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cstart,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(nd+no,&indices);CHKERRQ(ierr);
    /* owned columns are contiguous starting at cstart; ghosts come from garray */
    for (k=0; k<nd; k++) indices[k]    = cstart + k;
    for (k=0; k<no; k++) indices[nd+k] = garray[k];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Adiag),nd+no,indices,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
228ed502f03SStefano Zampini 
/*
  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE - standard MPIAIJ preallocation with the
  diagonal (b->A) and off-diagonal (b->B) blocks created as SEQAIJCUSPARSE matrices.

  d_nz/d_nnz and o_nz/o_nnz follow the MatMPIAIJSetPreallocation() conventions for
  the two blocks.  The blocks inherit B's bound-to-CPU state and the cuSPARSE
  handle and storage formats kept in the Mat_MPIAIJCUSPARSE structure.
*/
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* validate the user-provided per-row counts, in debug builds only */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
  /* drop stale column map and ghost/communication data */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    /* the off-diagonal block is empty when running on a single process */
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
288e057df02SPaul Mullowney 
/* yy = A*xx, overlapping the halo exchange of xx with the diagonal-block product */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU. */
  if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);}
  /* start gathering ghost values of xx; compute the diagonal-block product meanwhile */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* add the off-diagonal contribution computed from the gathered ghost values */
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
306ca45077fSPaul Mullowney 
/* Zero all stored entries by zeroing the diagonal and off-diagonal sequential blocks. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(mpiaij->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(mpiaij->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
317042217e8SBarry Smith 
/* zz = yy + A*xx, with the same scatter/compute overlap as MatMult */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  /* NOTE(review): unlike MatMult_MPIAIJCUSPARSE, a->lvec is not re-bound to the CPU
     here when A->boundtocpu is set -- confirm this asymmetry is intentional */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr); /* diagonal-block part while the scatter is in flight */
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr); /* add the off-diagonal contribution */
  PetscFunctionReturn(0);
}
333fdc842d1SBarry Smith 
/* yy = A^T * xx: local transpose products first, then a reverse scatter-add sends
   the off-diagonal contributions back to the owning processes */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->rmap->n,nt);
  /* B^T*xx produces contributions for ghost columns; stash them in lvec */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* accumulate the ghost contributions from lvec into the distributed yy */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3499ae82921SPaul Mullowney 
/*
  MatCUSPARSESetFormat_MPIAIJCUSPARSE - record the cuSPARSE storage format to use
  for SpMV with the diagonal block, the off-diagonal block, or both.

  The format is only stored in the Mat_MPIAIJCUSPARSE structure here; it is applied
  to the b->A/b->B blocks when they are (re)created during preallocation.
*/
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed: the message previously listed MAT_CUSPARSE_MULT_DIAG twice and named a
       nonexistent MAT_CUSPARSE_MULT_ALL; list the operations actually handled above */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
372e057df02SPaul Mullowney 
/*
  MatSetFromOptions_MPIAIJCUSPARSE - process the -mat_cusparse_* options selecting
  the cuSPARSE storage format used for SpMV on the diagonal and off-diagonal
  blocks.  Only consulted for unfactored matrices (MAT_FACTOR_NONE).
*/
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    /* convenience option: set the format of both blocks at once */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
403e057df02SPaul Mullowney 
/*
  MatAssemblyEnd_MPIAIJCUSPARSE - finish standard MPIAIJ assembly, ensure the ghost
  vector lives on the GPU, and free the cached device-side matrix mirror when the
  nonzero pattern changed during this assembly.
*/
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz = A->nonzerostate; /* snapshot, to detect pattern changes below */

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); } /* ghost values are gathered into GPU memory */
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    /* copy the device-resident struct to the host to reach the device pointers it holds */
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) { /* CSR index arrays are freed only when this flag marks them as owned */
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
43634d6c7a5SJose E. Roman 
437bbf3fe20SPaul Mullowney PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
438bbf3fe20SPaul Mullowney {
439bbf3fe20SPaul Mullowney   PetscErrorCode     ierr;
4403fa6b06aSMark Adams   Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
4413fa6b06aSMark Adams   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
442b06137fdSPaul Mullowney   cusparseStatus_t   stat;
443bbf3fe20SPaul Mullowney 
444bbf3fe20SPaul Mullowney   PetscFunctionBegin;
445*2c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
4463fa6b06aSMark Adams   if (cusparseStruct->deviceMat) {
447042217e8SBarry Smith     PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
448042217e8SBarry Smith     cudaError_t                cerr;
449042217e8SBarry Smith 
4503fa6b06aSMark Adams     ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
451042217e8SBarry Smith     ierr = PetscNew(&h_mat);CHKERRQ(ierr);
452042217e8SBarry Smith     cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
453042217e8SBarry Smith     cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
45499551766SMark Adams     if (h_mat->allocated_indices) {
45599551766SMark Adams       cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
45699551766SMark Adams       cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
45799551766SMark Adams       if (h_mat->offdiag.j) {
45899551766SMark Adams         cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
45999551766SMark Adams         cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
46099551766SMark Adams       }
46199551766SMark Adams     }
462042217e8SBarry Smith     cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
463042217e8SBarry Smith     ierr = PetscFree(h_mat);CHKERRQ(ierr);
4643fa6b06aSMark Adams   }
465bbf3fe20SPaul Mullowney   try {
4663fa6b06aSMark Adams     if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
4673fa6b06aSMark Adams     if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
46857d48284SJunchao Zhang     stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
469a0e72f99SJunchao Zhang     /* We want cusparseStruct to use PetscDefaultCudaStream
47017403302SKarl Rupp     if (cusparseStruct->stream) {
471042217e8SBarry Smith       cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
47217403302SKarl Rupp     }
473a0e72f99SJunchao Zhang     */
4747e8381f9SStefano Zampini     delete cusparseStruct->coo_p;
4757e8381f9SStefano Zampini     delete cusparseStruct->coo_pw;
476bbf3fe20SPaul Mullowney     delete cusparseStruct;
477bbf3fe20SPaul Mullowney   } catch(char *ex) {
47898921bdaSJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
479bbf3fe20SPaul Mullowney   }
4803338378cSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
481ed502f03SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
4827e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
4837e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
4843338378cSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
485ae48a8d0SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
486bbf3fe20SPaul Mullowney   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
487bbf3fe20SPaul Mullowney   PetscFunctionReturn(0);
488bbf3fe20SPaul Mullowney }
489ca45077fSPaul Mullowney 
4903338378cSStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
4919ae82921SPaul Mullowney {
4929ae82921SPaul Mullowney   PetscErrorCode     ierr;
493bbf3fe20SPaul Mullowney   Mat_MPIAIJ         *a;
494bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE *cusparseStruct;
495b06137fdSPaul Mullowney   cusparseStatus_t   stat;
4963338378cSStefano Zampini   Mat                A;
4979ae82921SPaul Mullowney 
4989ae82921SPaul Mullowney   PetscFunctionBegin;
4993338378cSStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
5003338378cSStefano Zampini     ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
5013338378cSStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
5023338378cSStefano Zampini     ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
5033338378cSStefano Zampini   }
5043338378cSStefano Zampini   A = *newmat;
5056f3d89d0SStefano Zampini   A->boundtocpu = PETSC_FALSE;
50634136279SStefano Zampini   ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
50734136279SStefano Zampini   ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);
50834136279SStefano Zampini 
509bbf3fe20SPaul Mullowney   a = (Mat_MPIAIJ*)A->data;
510d98d7c49SStefano Zampini   if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
511d98d7c49SStefano Zampini   if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
512d98d7c49SStefano Zampini   if (a->lvec) {
513d98d7c49SStefano Zampini     ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
514d98d7c49SStefano Zampini   }
515d98d7c49SStefano Zampini 
5163338378cSStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
517bbf3fe20SPaul Mullowney     a->spptr = new Mat_MPIAIJCUSPARSE;
5182205254eSKarl Rupp 
519bbf3fe20SPaul Mullowney     cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
520e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
521e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
5227e8381f9SStefano Zampini     cusparseStruct->coo_p               = NULL;
5237e8381f9SStefano Zampini     cusparseStruct->coo_pw              = NULL;
524042217e8SBarry Smith     cusparseStruct->stream              = 0;
5253fa6b06aSMark Adams     cusparseStruct->deviceMat           = NULL;
526042217e8SBarry Smith     stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
5273338378cSStefano Zampini   }
5282205254eSKarl Rupp 
52934d6c7a5SJose E. Roman   A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
530bbf3fe20SPaul Mullowney   A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
531fdc842d1SBarry Smith   A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
532bbf3fe20SPaul Mullowney   A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
533bbf3fe20SPaul Mullowney   A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
534bbf3fe20SPaul Mullowney   A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
5353fa6b06aSMark Adams   A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
5364e84afc0SStefano Zampini   A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
5372205254eSKarl Rupp 
538bbf3fe20SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
539ed502f03SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
5403338378cSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
541bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
5427e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
5437e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
544ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE)
545ae48a8d0SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
546ae48a8d0SStefano Zampini #endif
5479ae82921SPaul Mullowney   PetscFunctionReturn(0);
5489ae82921SPaul Mullowney }
5499ae82921SPaul Mullowney 
5503338378cSStefano Zampini PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
5513338378cSStefano Zampini {
5523338378cSStefano Zampini   PetscErrorCode ierr;
5533338378cSStefano Zampini 
5543338378cSStefano Zampini   PetscFunctionBegin;
555a4af0ceeSJacob Faibussowitsch   ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
5563338378cSStefano Zampini   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
5573338378cSStefano Zampini   ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
5583338378cSStefano Zampini   PetscFunctionReturn(0);
5593338378cSStefano Zampini }
5603338378cSStefano Zampini 
5613f9c0db1SPaul Mullowney /*@
5623f9c0db1SPaul Mullowney    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
563e057df02SPaul Mullowney    (the default parallel PETSc format).  This matrix will ultimately pushed down
5643f9c0db1SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
565e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
566e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
567e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
5689ae82921SPaul Mullowney 
569d083f849SBarry Smith    Collective
570e057df02SPaul Mullowney 
571e057df02SPaul Mullowney    Input Parameters:
572e057df02SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
573e057df02SPaul Mullowney .  m - number of rows
574e057df02SPaul Mullowney .  n - number of columns
575e057df02SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
576e057df02SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
5770298fd71SBarry Smith          (possibly different for each row) or NULL
578e057df02SPaul Mullowney 
579e057df02SPaul Mullowney    Output Parameter:
580e057df02SPaul Mullowney .  A - the matrix
581e057df02SPaul Mullowney 
582e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
583e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
584e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
585e057df02SPaul Mullowney 
586e057df02SPaul Mullowney    Notes:
587e057df02SPaul Mullowney    If nnz is given then nz is ignored
588e057df02SPaul Mullowney 
589e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
590e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
591e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
592e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
593e057df02SPaul Mullowney 
594e057df02SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
5950298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
596e057df02SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
597e057df02SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
598e057df02SPaul Mullowney 
599e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
600e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
601e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
602e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
603e057df02SPaul Mullowney 
604e057df02SPaul Mullowney    Level: intermediate
605e057df02SPaul Mullowney 
606e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
607e057df02SPaul Mullowney @*/
6089ae82921SPaul Mullowney PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
6099ae82921SPaul Mullowney {
6109ae82921SPaul Mullowney   PetscErrorCode ierr;
6119ae82921SPaul Mullowney   PetscMPIInt    size;
6129ae82921SPaul Mullowney 
6139ae82921SPaul Mullowney   PetscFunctionBegin;
6149ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
6159ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
616ffc4695bSBarry Smith   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
6179ae82921SPaul Mullowney   if (size > 1) {
6189ae82921SPaul Mullowney     ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
6199ae82921SPaul Mullowney     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
6209ae82921SPaul Mullowney   } else {
6219ae82921SPaul Mullowney     ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
6229ae82921SPaul Mullowney     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
6239ae82921SPaul Mullowney   }
6249ae82921SPaul Mullowney   PetscFunctionReturn(0);
6259ae82921SPaul Mullowney }
6269ae82921SPaul Mullowney 
6273ca39a21SBarry Smith /*MC
6286bb69460SJunchao Zhang    MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATMPIAIJCUSPARSE.
629e057df02SPaul Mullowney 
6302692e278SPaul Mullowney    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
6312692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
6322692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
6339ae82921SPaul Mullowney 
6349ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
6359ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
6369ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
6379ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
6389ae82921SPaul Mullowney    the above preallocation routines for simplicity.
6399ae82921SPaul Mullowney 
6409ae82921SPaul Mullowney    Options Database Keys:
641e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
6428468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
6438468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
6448468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
6459ae82921SPaul Mullowney 
6469ae82921SPaul Mullowney   Level: beginner
6479ae82921SPaul Mullowney 
6486bb69460SJunchao Zhang  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
6496bb69460SJunchao Zhang M*/
6506bb69460SJunchao Zhang 
6516bb69460SJunchao Zhang /*MC
6526bb69460SJunchao Zhang    MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATAIJCUSPARSE.
6536bb69460SJunchao Zhang 
6546bb69460SJunchao Zhang   Level: beginner
6556bb69460SJunchao Zhang 
6566bb69460SJunchao Zhang  .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
6579ae82921SPaul Mullowney M*/
6583fa6b06aSMark Adams 
659042217e8SBarry Smith // get GPU pointers to stripped down Mat. For both seq and MPI Mat.
660042217e8SBarry Smith PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
6613fa6b06aSMark Adams {
662042217e8SBarry Smith   PetscSplitCSRDataStructure d_mat;
663042217e8SBarry Smith   PetscMPIInt                size;
6643fa6b06aSMark Adams   PetscErrorCode             ierr;
665042217e8SBarry Smith   int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
666042217e8SBarry Smith   PetscScalar                *aa = NULL,*ba = NULL;
66799551766SMark Adams   Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
668042217e8SBarry Smith   Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
669042217e8SBarry Smith   CsrMatrix                  *matrixA = NULL,*matrixB = NULL;
6703fa6b06aSMark Adams 
6713fa6b06aSMark Adams   PetscFunctionBegin;
672*2c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
673042217e8SBarry Smith   if (A->factortype != MAT_FACTOR_NONE) {
6743fa6b06aSMark Adams     *B = NULL;
6753fa6b06aSMark Adams     PetscFunctionReturn(0);
6763fa6b06aSMark Adams   }
677042217e8SBarry Smith   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
67899551766SMark Adams   // get jaca
6793fa6b06aSMark Adams   if (size == 1) {
680042217e8SBarry Smith     PetscBool isseqaij;
681042217e8SBarry Smith 
682042217e8SBarry Smith     ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
683042217e8SBarry Smith     if (isseqaij) {
6843fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)A->data;
685*2c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
686042217e8SBarry Smith       cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
687042217e8SBarry Smith       d_mat = cusparsestructA->deviceMat;
688042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
6893fa6b06aSMark Adams     } else {
6903fa6b06aSMark Adams       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
691*2c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
692042217e8SBarry Smith       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
6933fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)aij->A->data;
694042217e8SBarry Smith       cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
695042217e8SBarry Smith       d_mat = spptr->deviceMat;
696042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
6973fa6b06aSMark Adams     }
698042217e8SBarry Smith     if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
699042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
700*2c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
701042217e8SBarry Smith       matrixA = (CsrMatrix*)matstruct->mat;
702042217e8SBarry Smith       bi = NULL;
703042217e8SBarry Smith       bj = NULL;
704042217e8SBarry Smith       ba = NULL;
705042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
706042217e8SBarry Smith   } else {
707042217e8SBarry Smith     Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
708*2c71b3e2SJacob Faibussowitsch     PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
709042217e8SBarry Smith     jaca = (Mat_SeqAIJ*)aij->A->data;
71099551766SMark Adams     jacb = (Mat_SeqAIJ*)aij->B->data;
711042217e8SBarry Smith     Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
7123fa6b06aSMark Adams 
713*2c71b3e2SJacob Faibussowitsch     PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
714042217e8SBarry Smith     cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
715042217e8SBarry Smith     Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
716*2c71b3e2SJacob Faibussowitsch     PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
717042217e8SBarry Smith     if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
718042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
719042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
720042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
721042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
722*2c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
723*2c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
724042217e8SBarry Smith       matrixA = (CsrMatrix*)matstructA->mat;
725042217e8SBarry Smith       matrixB = (CsrMatrix*)matstructB->mat;
7263fa6b06aSMark Adams       if (jacb->compressedrow.use) {
727042217e8SBarry Smith         if (!cusparsestructB->rowoffsets_gpu) {
728042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
729042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
7303fa6b06aSMark Adams         }
731042217e8SBarry Smith         bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
732042217e8SBarry Smith       } else {
733042217e8SBarry Smith         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
734042217e8SBarry Smith       }
735042217e8SBarry Smith       bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
736042217e8SBarry Smith       ba = thrust::raw_pointer_cast(matrixB->values->data());
737042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
738042217e8SBarry Smith     d_mat = spptr->deviceMat;
739042217e8SBarry Smith   }
7403fa6b06aSMark Adams   if (jaca->compressedrow.use) {
741042217e8SBarry Smith     if (!cusparsestructA->rowoffsets_gpu) {
742042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
743042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
7443fa6b06aSMark Adams     }
745042217e8SBarry Smith     ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
7463fa6b06aSMark Adams   } else {
747042217e8SBarry Smith     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
7483fa6b06aSMark Adams   }
749042217e8SBarry Smith   aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
750042217e8SBarry Smith   aa = thrust::raw_pointer_cast(matrixA->values->data());
751042217e8SBarry Smith 
752042217e8SBarry Smith   if (!d_mat) {
753042217e8SBarry Smith     cudaError_t                cerr;
754042217e8SBarry Smith     PetscSplitCSRDataStructure h_mat;
755042217e8SBarry Smith 
756042217e8SBarry Smith     // create and populate strucy on host and copy on device
757042217e8SBarry Smith     ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
758042217e8SBarry Smith     ierr = PetscNew(&h_mat);CHKERRQ(ierr);
759042217e8SBarry Smith     cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
760042217e8SBarry Smith     if (size > 1) { /* need the colmap array */
761042217e8SBarry Smith       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
76299551766SMark Adams       PetscInt   *colmap;
763042217e8SBarry Smith       PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;
764042217e8SBarry Smith 
765*2c71b3e2SJacob Faibussowitsch       PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
766042217e8SBarry Smith 
767042217e8SBarry Smith       ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
768042217e8SBarry Smith       for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
769365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
770365b711fSMark Adams       { // have to make a long version of these
77199551766SMark Adams         int        *h_bi32, *h_bj32;
77299551766SMark Adams         PetscInt   *h_bi64, *h_bj64, *d_bi64, *d_bj64;
773365b711fSMark Adams         ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr);
77499551766SMark Adams         cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
77599551766SMark Adams         for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
77699551766SMark Adams         cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
77799551766SMark Adams         for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];
77899551766SMark Adams 
77999551766SMark Adams         cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr);
78099551766SMark Adams         cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
78199551766SMark Adams         cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr);
78299551766SMark Adams         cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
78399551766SMark Adams 
78499551766SMark Adams         h_mat->offdiag.i = d_bi64;
78599551766SMark Adams         h_mat->offdiag.j = d_bj64;
78699551766SMark Adams         h_mat->allocated_indices = PETSC_TRUE;
78799551766SMark Adams 
788365b711fSMark Adams         ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr);
789365b711fSMark Adams       }
790365b711fSMark Adams #else
79199551766SMark Adams       h_mat->offdiag.i = (PetscInt*)bi;
79299551766SMark Adams       h_mat->offdiag.j = (PetscInt*)bj;
79399551766SMark Adams       h_mat->allocated_indices = PETSC_FALSE;
794365b711fSMark Adams #endif
795042217e8SBarry Smith       h_mat->offdiag.a = ba;
796042217e8SBarry Smith       h_mat->offdiag.n = A->rmap->n;
797042217e8SBarry Smith 
79899551766SMark Adams       cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr);
79999551766SMark Adams       cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
800042217e8SBarry Smith       ierr = PetscFree(colmap);CHKERRQ(ierr);
801042217e8SBarry Smith     }
802042217e8SBarry Smith     h_mat->rstart = A->rmap->rstart;
803042217e8SBarry Smith     h_mat->rend   = A->rmap->rend;
804042217e8SBarry Smith     h_mat->cstart = A->cmap->rstart;
805042217e8SBarry Smith     h_mat->cend   = A->cmap->rend;
80649b994a9SMark Adams     h_mat->M      = A->cmap->N;
807365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
808365b711fSMark Adams     {
809*2c71b3e2SJacob Faibussowitsch       PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size pof PetscInt = %d",sizeof(PetscInt));
81099551766SMark Adams       int        *h_ai32, *h_aj32;
81199551766SMark Adams       PetscInt   *h_ai64, *h_aj64, *d_ai64, *d_aj64;
812365b711fSMark Adams       ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr);
81399551766SMark Adams       cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
81499551766SMark Adams       for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
81599551766SMark Adams       cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
81699551766SMark Adams       for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];
81799551766SMark Adams 
81899551766SMark Adams       cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr);
81999551766SMark Adams       cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
82099551766SMark Adams       cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr);
82199551766SMark Adams       cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
82299551766SMark Adams 
82399551766SMark Adams       h_mat->diag.i = d_ai64;
82499551766SMark Adams       h_mat->diag.j = d_aj64;
82599551766SMark Adams       h_mat->allocated_indices = PETSC_TRUE;
82699551766SMark Adams 
827365b711fSMark Adams       ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr);
828365b711fSMark Adams     }
829365b711fSMark Adams #else
83099551766SMark Adams     h_mat->diag.i = (PetscInt*)ai;
83199551766SMark Adams     h_mat->diag.j = (PetscInt*)aj;
83299551766SMark Adams     h_mat->allocated_indices = PETSC_FALSE;
833365b711fSMark Adams #endif
834042217e8SBarry Smith     h_mat->diag.a = aa;
835042217e8SBarry Smith     h_mat->diag.n = A->rmap->n;
836042217e8SBarry Smith     h_mat->rank   = PetscGlobalRank;
837042217e8SBarry Smith     // copy pointers and metadata to device
838042217e8SBarry Smith     cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
839042217e8SBarry Smith     ierr = PetscFree(h_mat);CHKERRQ(ierr);
840042217e8SBarry Smith   } else {
841042217e8SBarry Smith     ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
842042217e8SBarry Smith   }
843042217e8SBarry Smith   *B = d_mat;
844042217e8SBarry Smith   A->offloadmask = PETSC_OFFLOAD_GPU;
8453fa6b06aSMark Adams   PetscFunctionReturn(0);
8463fa6b06aSMark Adams }
847